A multi-GPU implementation of the Cuckoo Filter. More...

Classes
struct	Partitioner
	Functor for partitioning keys across GPUs. More...

Public Types
using	T = typename Config::KeyType

Public Member Functions
	FilterMultiGPU (size_t numGPUs, size_t capacity, float memFactor=defaultMemoryFactor)
	Constructs a new FilterMultiGPU with default transfer plan.

	FilterMultiGPU (size_t numGPUs, size_t capacity, const char *transferPlanPath, float memFactor=defaultMemoryFactor)
	Constructs a new FilterMultiGPU with custom transfer plan.

	~FilterMultiGPU ()
	Destroys the FilterMultiGPU.

	FilterMultiGPU (const FilterMultiGPU &)=delete

FilterMultiGPU &	operator= (const FilterMultiGPU &)=delete

size_t	insertMany (const T *h_keys, size_t n, bool *h_output=nullptr)
	Inserts a batch of keys into the distributed filter.

void	containsMany (const T *h_keys, size_t n, bool *h_output)
	Checks for the presence of multiple keys in the filter.

size_t	deleteMany (const T *h_keys, size_t n, bool *h_output=nullptr)
	Deletes multiple keys from the filter.

float	loadFactor () const
	Calculates the global load factor.

template<typename Func >
void	parallelForGPUs (Func func) const
	Executes a function in parallel across all GPUs.

void	synchronizeAllGPUs ()
	Synchronizes all GPU streams used by this filter.

size_t	totalOccupiedSlots () const
	Returns the total number of occupied slots across all GPUs.

void	clear ()
	Clears all filters on all GPUs.

size_t	totalCapacity () const
	Returns the total capacity of the distributed filter.

size_t	sizeInBytes () const
	Returns the sum of the sizes in bytes reported by the filters on all GPUs.

size_t	insertMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< bool > &h_output)
	Inserts keys from a Thrust host vector.

size_t	insertMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< uint8_t > &h_output)
	Inserts keys from a Thrust host vector (uint8_t output).

size_t	insertMany (const thrust::host_vector< T > &h_keys)
	Inserts keys from a Thrust host vector without outputting results.

void	containsMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< bool > &h_output)
	Checks for existence of keys in a Thrust host vector.

void	containsMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< uint8_t > &h_output)
	Checks for existence of keys in a Thrust host vector (uint8_t output).

size_t	deleteMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< bool > &h_output)
	Deletes keys in a Thrust host vector.

size_t	deleteMany (const thrust::host_vector< T > &h_keys, thrust::host_vector< uint8_t > &h_output)
	Deletes keys in a Thrust host vector (uint8_t output).

size_t	deleteMany (const thrust::host_vector< T > &h_keys)
	Deletes keys in a Thrust host vector without outputting results.

Static Public Attributes
static constexpr float	defaultMemoryFactor = 0.8f
	Default fraction of free GPU memory to use for buffers (after filter allocation)

Detailed Description

template<typename Config>
class cuckoogpu::FilterMultiGPU< Config >

A multi-GPU implementation of the Cuckoo Filter.

This class partitions keys across multiple GPUs using the gossip library for efficient multi-GPU communication. It handles data distribution using gossip's multisplit and all-to-all primitives, and aggregates results.

Template Parameters

Config The configuration structure for the Cuckoo Filter.

Definition at line 37 of file CuckooFilterMultiGPU.cuh.

Member Typedef Documentation

◆ T

template<typename Config >

using cuckoogpu::FilterMultiGPU< Config >::T = typename Config::KeyType

Definition at line 39 of file CuckooFilterMultiGPU.cuh.

Constructor & Destructor Documentation

◆ FilterMultiGPU() [1/3]

template<typename Config >

cuckoogpu::FilterMultiGPU< Config >::FilterMultiGPU	(	size_t	numGPUs,
		size_t	capacity,
		float	memFactor = `defaultMemoryFactor`
	)

inline

Constructs a new FilterMultiGPU with default transfer plan.

Initializes gossip context, multisplit, all-to-all primitives, and Filter instances on each available GPU.

Parameters

numGPUs	Number of GPUs to use.
capacity	Total capacity of the distributed filter.
memFactor	Fraction of free GPU memory to use for buffers.

Definition at line 324 of file CuckooFilterMultiGPU.cuh.

        : numGPUs(numGPUs),
          capacityPerGPU(static_cast<size_t>(SDIV(capacity, numGPUs) * 1.02)),
          memoryFactor(memFactor),
          gossipContext(numGPUs),
          multisplit(gossipContext),
          all2all(gossipContext, gossip::all2all::default_plan(numGPUs)),
          all2allResults(gossipContext, gossip::all2all::default_plan(numGPUs)),
          srcBuffers(numGPUs, nullptr),
          dstBuffers(numGPUs, nullptr),
          bufferCapacities(numGPUs, 0),
          resultSrcBuffers(numGPUs, nullptr),
          resultDstBuffers(numGPUs, nullptr),
          totalBufferCapacity(0) {
        assert(numGPUs > 0 && "Number of GPUs must be at least 1");
 
        filters.resize(numGPUs);
 
        for (size_t i = 0; i < numGPUs; ++i) {
            CUDA_CALL(cudaSetDevice(gossipContext.get_device_id(i)));
            Filter<Config>* filter;
            CUDA_CALL(cudaMallocManaged(&filter, sizeof(Filter<Config>)));
            new (filter) Filter<Config>(capacityPerGPU);
            filters[i] = filter;
        }
        gossipContext.sync_hard();
 
        allocateBuffers();
    }

◆ FilterMultiGPU() [2/3]

template<typename Config >

cuckoogpu::FilterMultiGPU< Config >::FilterMultiGPU	(	size_t	numGPUs,
		size_t	capacity,
		const char *	transferPlanPath,
		float	memFactor = `defaultMemoryFactor`
	)

inline

Constructs a new FilterMultiGPU with custom transfer plan.

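Initializes the gossip context, multisplit, and all-to-all primitives using the transfer plan loaded from the given file, and creates a Filter instance on each GPU. If the parsed plan reports zero GPUs, the default all-to-all plan for numGPUs is used instead.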

Parameters

numGPUs	Number of GPUs to use.
capacity	Total capacity of the distributed filter.
transferPlanPath	Path to gossip transfer plan file for optimized topology-aware transfers.
memFactor	Fraction of free GPU memory to use for buffers.

Definition at line 366 of file CuckooFilterMultiGPU.cuh.

        : numGPUs(numGPUs),
          capacityPerGPU(static_cast<size_t>(SDIV(capacity, numGPUs) * 1.02)),
          memoryFactor(memFactor),
          gossipContext(numGPUs),
          multisplit(gossipContext),
          all2all(
              gossipContext,
              [&]() {
                  auto plan = parse_plan(transferPlanPath);
                  if (plan.num_gpus() == 0) {
                      return gossip::all2all::default_plan(numGPUs);
                  }
                  return plan;
              }()
          ),
          all2allResults(
              gossipContext,
              [&]() {
                  auto plan = parse_plan(transferPlanPath);
                  if (plan.num_gpus() == 0) {
                      return gossip::all2all::default_plan(numGPUs);
                  }
                  return plan;
              }()
          ),
          srcBuffers(numGPUs, nullptr),
          dstBuffers(numGPUs, nullptr),
          bufferCapacities(numGPUs, 0),
          resultSrcBuffers(numGPUs, nullptr),
          resultDstBuffers(numGPUs, nullptr),
          totalBufferCapacity(0) {
        assert(numGPUs > 0 && "Number of GPUs must be at least 1");
 
        filters.resize(numGPUs);
 
        for (size_t i = 0; i < numGPUs; ++i) {
            CUDA_CALL(cudaSetDevice(gossipContext.get_device_id(i)));
            Filter<Config>* filter;
            CUDA_CALL(cudaMallocManaged(&filter, sizeof(Filter<Config>)));
            new (filter) Filter<Config>(capacityPerGPU);
            filters[i] = filter;
        }
        gossipContext.sync_hard();
 
        allocateBuffers();
    }

◆ ~FilterMultiGPU()

template<typename Config >

cuckoogpu::FilterMultiGPU< Config >::~FilterMultiGPU ( )

inline

Destroys the FilterMultiGPU.

Cleans up filter instances and pre-allocated buffers.

Definition at line 424 of file CuckooFilterMultiGPU.cuh.

                      {
        freeBuffers();
        for (size_t i = 0; i < numGPUs; ++i) {
            CUDA_CALL(cudaSetDevice(gossipContext.get_device_id(i)));
            filters[i]->~Filter<Config>();
            CUDA_CALL(cudaFree(filters[i]));
        }
    }

◆ FilterMultiGPU() [3/3]

template<typename Config >

cuckoogpu::FilterMultiGPU< Config >::FilterMultiGPU ( const FilterMultiGPU< Config > & )

delete

Member Function Documentation

◆ clear()

template<typename Config >

void cuckoogpu::FilterMultiGPU< Config >::clear ( )

inline

Clears all filters on all GPUs.

Definition at line 577 of file CuckooFilterMultiGPU.cuh.

                 {
        parallelForGPUs([&](size_t i) { filters[i]->clear(); });
    }

Here is the call graph for this function:

◆ containsMany() [1/3]

template<typename Config >

void cuckoogpu::FilterMultiGPU< Config >::containsMany	(	const T *	h_keys,
		size_t	n,
		bool *	h_output
	)

inline

Checks for the presence of multiple keys in the filter.

Parameters

h_keys	Pointer to host memory containing keys to check.
n	Number of keys to check.
h_output	Pointer to host memory to store results (true if present, false otherwise).

Definition at line 477 of file CuckooFilterMultiGPU.cuh.

                                                                 {
        executeOperation<false, true>(
            h_keys,
            n,
            h_output,
            [](Filter<Config>* filter,
               const T* keys,
               bool* results,
               size_t count,
               cudaStream_t stream) { filter->containsMany(keys, count, results, stream); }
        );
    }

Here is the call graph for this function:

Here is the caller graph for this function:

◆ containsMany() [2/3]

template<typename Config >

void cuckoogpu::FilterMultiGPU< Config >::containsMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< bool > &	h_output
	)

inline

Checks for existence of keys in a Thrust host vector.

Parameters

h_keys	Vector of keys to check.
h_output	Vector to store results (bool). Resized if necessary.

Definition at line 646 of file CuckooFilterMultiGPU.cuh.

                                                                                             {
        h_output.resize(h_keys.size());
        containsMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            thrust::raw_pointer_cast(h_output.data())
        );
    }

Here is the call graph for this function:

◆ containsMany() [3/3]

template<typename Config >

void cuckoogpu::FilterMultiGPU< Config >::containsMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< uint8_t > &	h_output
	)

inline

Checks for existence of keys in a Thrust host vector (uint8_t output).

Parameters

h_keys	Vector of keys to check.
h_output	Vector to store results (uint8_t). Resized if necessary.

Definition at line 661 of file CuckooFilterMultiGPU.cuh.

                                                                                           {
        h_output.resize(h_keys.size());
        containsMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            reinterpret_cast<bool*>(thrust::raw_pointer_cast(h_output.data()))
        );
    }

Here is the call graph for this function:

◆ deleteMany() [1/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::deleteMany	(	const T *	h_keys,
		size_t	n,
		bool *	h_output = `nullptr`
	)

inline

Deletes multiple keys from the filter.

Parameters

h_keys	Pointer to host memory containing keys to delete.
n	Number of keys to delete.
h_output	Optional pointer to host memory to store results (true if found and deleted).

Returns: The total number of occupied slots across all GPUs after deletion.

Definition at line 497 of file CuckooFilterMultiGPU.cuh.

                                                                           {
        if (h_output) {
            return executeOperation<true, true>(
                h_keys,
                n,
                h_output,
                [](Filter<Config>* filter,
                   const T* keys,
                   bool* results,
                   size_t count,
                   cudaStream_t stream) { filter->deleteMany(keys, count, results, stream); }
            );
        } else {
            return executeOperation<true, false>(
                h_keys,
                n,
                nullptr,
                [](Filter<Config>* filter,
                   const T* keys,
                   bool* /*unused results*/,
                   size_t count,
                   cudaStream_t stream) { filter->deleteMany(keys, count, nullptr, stream); }
            );
        }
    }

Here is the call graph for this function:

Here is the caller graph for this function:

◆ deleteMany() [2/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::deleteMany ( const thrust::host_vector< T > & h_keys )

inline

Deletes keys in a Thrust host vector without outputting results.

Parameters

h_keys Vector of keys to delete.

Returns: size_t Total number of occupied slots.

Definition at line 706 of file CuckooFilterMultiGPU.cuh.

                                                          {
        return deleteMany(thrust::raw_pointer_cast(h_keys.data()), h_keys.size(), nullptr);
    }

Here is the call graph for this function:

◆ deleteMany() [3/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::deleteMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< bool > &	h_output
	)

inline

Deletes keys in a Thrust host vector.

Parameters

h_keys	Vector of keys to delete.
h_output	Vector to store results (bool). Resized if necessary.

Returns: size_t Total number of occupied slots.

Definition at line 676 of file CuckooFilterMultiGPU.cuh.

                                                                                             {
        h_output.resize(h_keys.size());
        return deleteMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            thrust::raw_pointer_cast(h_output.data())
        );
    }

Here is the call graph for this function:

◆ deleteMany() [4/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::deleteMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< uint8_t > &	h_output
	)

inline

Deletes keys in a Thrust host vector (uint8_t output).

Parameters

h_keys	Vector of keys to delete.
h_output	Vector to store results (uint8_t). Resized if necessary.

Returns: size_t Total number of occupied slots.

Definition at line 692 of file CuckooFilterMultiGPU.cuh.

                                                                                         {
        h_output.resize(h_keys.size());
        return deleteMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            reinterpret_cast<bool*>(thrust::raw_pointer_cast(h_output.data()))
        );
    }

Here is the call graph for this function:

◆ insertMany() [1/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::insertMany	(	const T *	h_keys,
		size_t	n,
		bool *	h_output = `nullptr`
	)

inline

Inserts a batch of keys into the distributed filter.

Uses gossip primitives for efficient multi-GPU data distribution.

Parameters

h_keys	Pointer to host memory containing keys to insert.
n	Number of keys to insert.
h_output	Optional pointer to host memory to store results (true if successfully inserted).

Returns: The total number of occupied slots across all GPUs after insertion.

Definition at line 445 of file CuckooFilterMultiGPU.cuh.

                                                                           {
        if (h_output) {
            return executeOperation<true, true>(
                h_keys,
                n,
                h_output,
                [](Filter<Config>* filter,
                   const T* keys,
                   bool* results,
                   size_t count,
                   cudaStream_t stream) { filter->insertMany(keys, count, results, stream); }
            );
        } else {
            return executeOperation<true, false>(
                h_keys,
                n,
                nullptr,
                [](Filter<Config>* filter,
                   const T* keys,
                   bool* /*unused results*/,
                   size_t count,
                   cudaStream_t stream) { filter->insertMany(keys, count, nullptr, stream); }
            );
        }
    }

Here is the call graph for this function:

Here is the caller graph for this function:

◆ insertMany() [2/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::insertMany ( const thrust::host_vector< T > & h_keys )

inline

Inserts keys from a Thrust host vector without outputting results.

Parameters

h_keys Vector of keys to insert.

Returns: size_t Total number of occupied slots.

Definition at line 637 of file CuckooFilterMultiGPU.cuh.

                                                          {
        return insertMany(thrust::raw_pointer_cast(h_keys.data()), h_keys.size(), nullptr);
    }

Here is the call graph for this function:

◆ insertMany() [3/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::insertMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< bool > &	h_output
	)

inline

Inserts keys from a Thrust host vector.

Parameters

h_keys	Vector of keys to insert.
h_output	Vector to store results (bool). Resized if necessary.

Returns: size_t Total number of occupied slots.

Definition at line 607 of file CuckooFilterMultiGPU.cuh.

                                                                                             {
        h_output.resize(h_keys.size());
        return insertMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            thrust::raw_pointer_cast(h_output.data())
        );
    }

Here is the call graph for this function:

◆ insertMany() [4/4]

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::insertMany	(	const thrust::host_vector< T > &	h_keys,
		thrust::host_vector< uint8_t > &	h_output
	)

inline

Inserts keys from a Thrust host vector (uint8_t output).

Parameters

h_keys	Vector of keys to insert.
h_output	Vector to store results (uint8_t). Resized if necessary.

Returns: size_t Total number of occupied slots.

Definition at line 623 of file CuckooFilterMultiGPU.cuh.

                                                                                         {
        h_output.resize(h_keys.size());
        return insertMany(
            thrust::raw_pointer_cast(h_keys.data()),
            h_keys.size(),
            reinterpret_cast<bool*>(thrust::raw_pointer_cast(h_output.data()))
        );
    }

Here is the call graph for this function:

◆ loadFactor()

template<typename Config >

float cuckoogpu::FilterMultiGPU< Config >::loadFactor ( ) const

inline

Calculates the global load factor.

Returns: float Load factor (total occupied / total capacity).

Definition at line 527 of file CuckooFilterMultiGPU.cuh.

                                           {
        return static_cast<float>(totalOccupiedSlots()) / static_cast<float>(totalCapacity());
    }

Here is the call graph for this function:

◆ operator=()

template<typename Config >

FilterMultiGPU & cuckoogpu::FilterMultiGPU< Config >::operator= ( const FilterMultiGPU< Config > & )

delete

◆ parallelForGPUs()

template<typename Config >

template<typename Func >

void cuckoogpu::FilterMultiGPU< Config >::parallelForGPUs ( Func func ) const

inline

Executes a function in parallel across all GPUs.

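Spawns one thread per GPU; each thread selects its GPU as the current CUDA device and then runs the provided function with that GPU's index. The call returns only after all threads have been joined.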

Template Parameters

Func	Type of the function to execute.

Parameters

func	The function to execute, taking the GPU index as an argument.

Definition at line 540 of file CuckooFilterMultiGPU.cuh.

                                          {
        std::vector<std::thread> threads;
        for (size_t i = 0; i < numGPUs; ++i) {
            threads.emplace_back([=, this]() {
                CUDA_CALL(cudaSetDevice(gossipContext.get_device_id(i)));
                func(i);
            });
        }
 
        for (auto& t : threads) {
            t.join();
        }
    }

Here is the caller graph for this function:

◆ sizeInBytes()

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::sizeInBytes ( ) const

inline

Returns the sum of the sizes in bytes reported by the filters on all GPUs.

Definition at line 593 of file CuckooFilterMultiGPU.cuh.

                                             {
        std::atomic<size_t> total(0);
        parallelForGPUs([&](size_t i) {
            total.fetch_add(filters[i]->sizeInBytes(), std::memory_order_relaxed);
        });
        return total.load();
    }

Here is the call graph for this function:

Here is the caller graph for this function:

◆ synchronizeAllGPUs()

template<typename Config >

void cuckoogpu::FilterMultiGPU< Config >::synchronizeAllGPUs ( )

inline

Synchronizes all GPU streams used by this filter.

Definition at line 557 of file CuckooFilterMultiGPU.cuh.

                              {
        gossipContext.sync_all_streams();
    }

◆ totalCapacity()

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::totalCapacity ( ) const

inline

Returns the total capacity of the distributed filter.

Returns: size_t Total capacity.

Definition at line 585 of file CuckooFilterMultiGPU.cuh.

                                               {
        std::atomic<size_t> total(0);
        parallelForGPUs([&](size_t i) {
            total.fetch_add(filters[i]->capacity(), std::memory_order_relaxed);
        });
        return total.load();
    }

Here is the call graph for this function:

Here is the caller graph for this function:

◆ totalOccupiedSlots()

template<typename Config >

size_t cuckoogpu::FilterMultiGPU< Config >::totalOccupiedSlots ( ) const

inline

Returns the total number of occupied slots across all GPUs.

Returns: size_t Total occupied slots.

Definition at line 565 of file CuckooFilterMultiGPU.cuh.

                                                    {
        std::atomic<size_t> total(0);
        parallelForGPUs([&](size_t i) {
            total.fetch_add(filters[i]->occupiedSlots(), std::memory_order_relaxed);
        });
 
        return total.load();
    }

Here is the call graph for this function:

Here is the caller graph for this function:

Member Data Documentation

◆ defaultMemoryFactor

template<typename Config >

constexpr float cuckoogpu::FilterMultiGPU< Config >::defaultMemoryFactor = 0.8f

static constexpr

Default fraction of free GPU memory to use for buffers (after filter allocation)

Definition at line 57 of file CuckooFilterMultiGPU.cuh.

The documentation for this class was generated from the following file:

CuckooFilterMultiGPU.cuh

Classes

Public Types

Public Member Functions

Static Public Attributes

Detailed Description

Member Typedef Documentation

◆ T

Constructor & Destructor Documentation

◆ FilterMultiGPU() [1/3]

◆ FilterMultiGPU() [2/3]

◆ ~FilterMultiGPU()

◆ FilterMultiGPU() [3/3]

Member Function Documentation

◆ clear()

◆ containsMany() [1/3]

◆ containsMany() [2/3]

◆ containsMany() [3/3]

◆ deleteMany() [1/4]

◆ deleteMany() [2/4]

◆ deleteMany() [3/4]

◆ deleteMany() [4/4]

◆ insertMany() [1/4]

◆ insertMany() [2/4]

◆ insertMany() [3/4]

◆ insertMany() [4/4]

◆ loadFactor()

◆ operator=()

◆ parallelForGPUs()

◆ sizeInBytes()

◆ synchronizeAllGPUs()

◆ totalCapacity()

◆ totalOccupiedSlots()

Member Data Documentation

◆ defaultMemoryFactor