cuSBF
Loading...
Searching...
No Matches
Namespaces | Classes | Functions | Variables
cusbf::detail Namespace Reference

Namespaces

namespace  xxhash
 XXHash_64 implementation from.
 

Classes

struct  BitwiseOr
 
struct  SaltLiteral
 Compile-time golden-ratio-derived multiplicative salt constants. More...
 
struct  SaltLiteral< 0 >
 
struct  SaltLiteral< 1 >
 
struct  SaltLiteral< 10 >
 
struct  SaltLiteral< 11 >
 
struct  SaltLiteral< 12 >
 
struct  SaltLiteral< 13 >
 
struct  SaltLiteral< 14 >
 
struct  SaltLiteral< 15 >
 
struct  SaltLiteral< 2 >
 
struct  SaltLiteral< 3 >
 
struct  SaltLiteral< 4 >
 
struct  SaltLiteral< 5 >
 
struct  SaltLiteral< 6 >
 
struct  SaltLiteral< 7 >
 
struct  SaltLiteral< 8 >
 
struct  SaltLiteral< 9 >
 
struct  SequenceKmerInput
 Kernel input descriptor for a sequence k-mer sweep. More...
 

Functions

template<typename T >
consteval uint64_t validByteCount ()
 
template<typename T >
consteval bool separatorPositionAlwaysEncodesInvalid (char *input, uint64_t separatorPosition, uint64_t index)
 Recursively tests whether placing the separator byte at any position in an input of valid bytes always results in an invalid encoding.
 
template<typename T >
consteval bool separatorByteAlwaysEncodesInvalid ()
 Tests that for every position in the input, placing the separator byte at that position always results in an invalid encoding.
 
template<typename Config >
__global__ void containsSequenceKmersKernel (SequenceKmerInput< Config > input, device_span< const typename Filter< Config >::Shard > shards, device_span< uint8_t > output)
 CUDA kernel: queries k-mers from a sequence against the filter.
 
template<typename Config >
__device__ __forceinline__ bool prepareSequenceHashTiles (const char *sequence, uint64_t blockStartKmer, uint64_t blockKmers, uint8_t *sequenceTile)
 Cooperatively loads and encodes a tile of symbols into shared memory.
 
template<typename Config >
__global__ void insertSequenceKmersKernel (SequenceKmerInput< Config > input, device_span< typename Filter< Config >::Shard > shards)
 CUDA kernel: inserts k-mers from a sequence into the filter.
 
template<uint64_t Index>
__host__ __device__ __forceinline__ constexpr uint64_t multiplicativeSaltLiteral ()
 Returns the multiplicative salt constant for hash function Index.
 
template<typename Config , typename Fn , uint64_t... HashIndices>
__host__ __device__ __forceinline__ void forEachHashIndexImpl (Fn &&fn, std::index_sequence< HashIndices... >)
 Implementation helper for forEachHashIndex (fold-expression over an index sequence).
 
template<typename Config , typename Fn >
__host__ __device__ __forceinline__ void forEachHashIndex (Fn &&fn)
 Invokes fn for each hash index in [0, Config::hashCount) at compile time.
 
template<typename Config , uint64_t Length>
__host__ __device__ __forceinline__ constexpr uint64_t packedWindowMask ()
 Returns a bitmask covering Length packed alphabet symbols.
 
template<typename Config , uint64_t WindowLength, uint64_t K>
__host__ __device__ __forceinline__ constexpr uint64_t extractPackedSubwindow (uint64_t packedKmer, uint64_t start)
 Extracts a packed sub-window from a packed k-mer.
 
__device__ __forceinline__ void atomicOrWord (uint64_t *ptr, uint64_t value)
 Atomically ORs value into the device word at ptr.
 
template<typename Config >
__device__ __forceinline__ uint64_t packedKmerMinimizerHash (uint64_t packedKmer)
 Computes the minimizer hash for a packed k-mer.
 
template<typename Config >
__device__ __forceinline__ uint64_t packedKmerSmerHash (uint64_t packedKmer, uint64_t start)
 Computes the hash for the s-mer at position start within a packed k-mer.
 
template<typename Config >
__device__ __forceinline__ void loadShardWords4 (const typename Filter< Config >::Shard *shards, uint64_t shardIndex, uint64_t *w)
 Loads all four 64-bit words of a shard into a local array.
 
template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t packKmerFromTile (const uint8_t *tile, uint64_t start)
 Packs K symbols from a shared-memory tile into an integer.
 
template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t advancePackedKmer (uint64_t packed, uint8_t newBase)
 Slides the packed k-mer window forward by one symbol.
 
template<typename Config >
__device__ __forceinline__ bool sectorizedContainsPackedKmer (uint64_t packedKmer, const uint64_t *w)
 Tests whether a packed k-mer is present in a pre-loaded shard.
 
template<typename Config >
__device__ __forceinline__ bool kmerIsValid (const uint8_t *tile, uint64_t start)
 
constexpr __host__ __device__ __forceinline__ uint64_t hash64 (uint64_t key)
 Fast 64-bit integer hash (non-cryptographic).
 
constexpr __host__ __device__ __forceinline__ uint64_t minimizerHash64 (uint64_t key)
 Fast 64-bit hash sufficient for uniform minimizer selection.
 

Variables

constexpr uint32_t kContainsSequenceStride = 4
 
constexpr uint64_t kInvalidHash = std::numeric_limits<uint64_t>::max()
 Sentinel hash value indicating "no valid minimizer found".
 

Function Documentation

◆ advancePackedKmer()

template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t cusbf::detail::advancePackedKmer ( uint64_t  packed,
uint8_t  newBase 
)

Slides the packed k-mer window forward by one symbol.

Shifts the existing packed representation left by one symbol, inserts the new symbol in the least-significant position, and masks to K symbols.

Template Parameters
Kk-mer length.
Parameters
packedCurrent packed k-mer.
newBasePre-encoded new symbol.
Returns
Updated packed k-mer.

Definition at line 1411 of file BloomFilter.cuh.

1411 {
1412 return ((packed << Config::symbolBits) | (newBase & Config::symbolMask)) &
1413 packedWindowMask<Config, K>();
1414}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ atomicOrWord()

__device__ __forceinline__ void cusbf::detail::atomicOrWord ( uint64_t ptr,
uint64_t  value 
)

Atomically ORs value into the device word at ptr.

Parameters
ptrTarget device word.
valueValue to OR in.

Definition at line 319 of file BloomFilter.cuh.

319 {
320 atomicOr(reinterpret_cast<unsigned long long*>(ptr), static_cast<unsigned long long>(value));
321}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ containsSequenceKmersKernel()

template<typename Config >
__global__ void cusbf::detail::containsSequenceKmersKernel ( SequenceKmerInput< Config input,
device_span< const typename Filter< Config >::Shard >  shards,
device_span< uint8_t output 
)

CUDA kernel: queries k-mers from a sequence against the filter.

Each thread processes kStride consecutive k-mers to amortise packing and shard loads. Threads sharing the same shard collaborate via __match_any_sync to load the shard words once and broadcast them.

Template Parameters
ConfigFilter configuration.
Parameters
inputSequence descriptor (device span + k-mer count).
shardsDevice-resident shard array (read-only).
outputPer-k-mer result buffer (1 = present, 0 = absent).

Definition at line 1503 of file BloomFilter.cuh.

1507 {
1508 // Each thread handles this many consecutive k-mers to amortise packing
1509 constexpr uint32_t kStride = kContainsSequenceStride;
1510 constexpr uint64_t sequenceTileBases = Config::cudaBlockSize * kStride + Config::k - 1;
1511
1512 __shared__ uint8_t sequenceTile[sequenceTileBases];
1513
1514 const uint64_t numKmers = input.kmerCount();
1515 const uint64_t blockStartKmer =
1516 static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize * kStride;
1517 if (blockStartKmer >= numKmers) {
1518 return;
1519 }
1520
1521 const uint64_t blockKmers = min(Config::cudaBlockSize * kStride, numKmers - blockStartKmer);
1522
1523 const bool blockAllValid = prepareSequenceHashTiles<Config>(
1524 input.sequence.data(), blockStartKmer, blockKmers, sequenceTile
1525 );
1526
1527 const uint64_t threadOffset = static_cast<uint64_t>(threadIdx.x) * kStride;
1528 if (threadOffset >= blockKmers) {
1529 return;
1530 }
1531
1532 // Bitmask: bit s set = k-mer at offset s is valid.
1533 uint32_t kmerValidMask = 0;
1534 _Pragma("unroll")
1535 for (uint32_t s = 0; s < kStride; ++s) {
1536 if ((threadOffset + s) < blockKmers) {
1537 kmerValidMask |= (1u << s);
1538 }
1539 }
1540
1541 if (!blockAllValid) {
1542 _Pragma("unroll")
1543 for (uint32_t s = 0; s < kStride; ++s) {
1544 if (!(kmerValidMask & (1u << s))) {
1545 continue;
1546 }
1547 const uint64_t localIdx = threadOffset + s;
1548 if (!kmerIsValid<Config>(sequenceTile, localIdx)) {
1549 kmerValidMask &= ~(1u << s);
1550 }
1551 }
1552 }
1553
1554 // Always pack from position 0. Sliding propagates the packed value forward
1555 // invalid bases from earlier k-mers are simply shifted out.
1556 uint64_t packedKmer = packKmerFromTile<Config, Config::k>(sequenceTile, threadOffset);
1557
1558 for (uint32_t s = 0; s < kStride; ++s) {
1559 const uint64_t localIdx = threadOffset + s;
1560 if (localIdx >= blockKmers) {
1561 break;
1562 }
1563
1564 const uint64_t kmerIndex = blockStartKmer + localIdx;
1565
1566 if (s > 0) {
1567 packedKmer = advancePackedKmer<Config, Config::k>(
1568 packedKmer, sequenceTile[localIdx + Config::k - 1]
1569 );
1570 }
1571
1572 if (!(kmerValidMask & (1u << s))) {
1573 output[kmerIndex] = 0;
1574 continue;
1575 }
1576
1577 const uint64_t minimizerHash = packedKmerMinimizerHash<Config>(packedKmer);
1578
1579 // Warp-level shard sharing.
1580 const auto shardIdx = static_cast<uint32_t>(minimizerHash & (shards.size() - 1));
1581 const uint32_t peers = __match_any_sync(0xFFFFFFFFu, shardIdx);
1582 const int leader = __ffs(static_cast<int>(peers)) - 1;
1583
1584 uint64_t w[4];
1585 if (static_cast<int>(threadIdx.x & 31u) == leader) {
1586 loadShardWords4<Config>(shards.data(), shardIdx, w);
1587 }
1588 w[0] = __shfl_sync(peers, w[0], leader);
1589 w[1] = __shfl_sync(peers, w[1], leader);
1590 w[2] = __shfl_sync(peers, w[2], leader);
1591 w[3] = __shfl_sync(peers, w[3], leader);
1592
1593 const bool present = sectorizedContainsPackedKmer<Config>(packedKmer, w);
1594 output[kmerIndex] = present;
1595 }
1596}
constexpr uint32_t kContainsSequenceStride
constexpr __host__ __device__ uint64_t kmerCount() const
device_span< const char > sequence

◆ extractPackedSubwindow()

template<typename Config , uint64_t WindowLength, uint64_t K>
__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::extractPackedSubwindow ( uint64_t  packedKmer,
uint64_t  start 
)
constexpr

Extracts a packed sub-window from a packed k-mer.

Extracts WindowLength consecutive bases starting at start from a packed k-mer of length K (MSB = first base).

Template Parameters
WindowLengthLength of the sub-window to extract.
KLength of the full k-mer.
Parameters
packedKmerPacked k-mer (MSB = first base).
startZero-based start position.
Returns
Packed sub-window.

Definition at line 307 of file BloomFilter.cuh.

307 {
308 static_assert(WindowLength <= K, "WindowLength must not exceed K");
309 return (packedKmer >> (Config::symbolBits * (K - (start + WindowLength)))) &
310 packedWindowMask<Config, WindowLength>();
311}
Here is the call graph for this function:

◆ forEachHashIndex()

template<typename Config , typename Fn >
__host__ __device__ __forceinline__ void cusbf::detail::forEachHashIndex ( Fn &&  fn)

Invokes fn for each hash index in [0, Config::hashCount) at compile time.

Template Parameters
ConfigFilter configuration.
FnCallable with signature void(std::integral_constant<uint64_t, I>).
Parameters
fnCallable to invoke for each index.

Definition at line 271 of file BloomFilter.cuh.

271 {
272 forEachHashIndexImpl<Config>(
273 static_cast<Fn&&>(fn), std::make_index_sequence<Config::hashCount>{}
274 );
275}
Here is the call graph for this function:

◆ forEachHashIndexImpl()

template<typename Config , typename Fn , uint64_t... HashIndices>
__host__ __device__ __forceinline__ void cusbf::detail::forEachHashIndexImpl ( Fn &&  fn,
std::index_sequence< HashIndices... >   
)

Implementation helper for forEachHashIndex (fold-expression over an index sequence).

Definition at line 259 of file BloomFilter.cuh.

259 {
260 (fn(std::integral_constant<uint64_t, HashIndices>{}), ...);
261}
Here is the call graph for this function:

◆ hash64()

constexpr __host__ __device__ __forceinline__ uint64_t cusbf::detail::hash64 ( uint64_t  key)
constexpr

Fast 64-bit integer hash (non-cryptographic).

One multiplicative step followed by an xorshift. Used to hash s-mer packed representations for Bloom bit-position selection.

Parameters
keyInput value.
Returns
Hashed value.

Definition at line 192 of file hashutil.cuh.

192 {
193 key *= 0x9e3779b97f4a7c15ULL;
194 key ^= key >> 33;
195 return key;
196}
Here is the call graph for this function:

◆ insertSequenceKmersKernel()

template<typename Config >
__global__ void cusbf::detail::insertSequenceKmersKernel ( SequenceKmerInput< Config input,
device_span< typename Filter< Config >::Shard >  shards 
)

CUDA kernel: inserts k-mers from a sequence into the filter.

Each thread processes one k-mer. Consecutive threads targeting the same shard use cub::WarpReduce::HeadSegmentedReduce to merge bitmasks before the run head issues a minimal number of atomicOr operations.

Template Parameters
ConfigFilter configuration.
Parameters
inputSequence descriptor.
shardsDevice-resident shard array (modified in place).

Definition at line 1610 of file BloomFilter.cuh.

1613 {
1614 constexpr uint64_t sequenceTileBases = Config::cudaBlockSize + Config::k - 1;
1615 constexpr uint32_t warpSize = 32;
1616 constexpr uint32_t warpsPerBlock = Config::cudaBlockSize / warpSize;
1617
1618 using WarpReduceWord = cub::WarpReduce<uint64_t>;
1619
1620 __shared__ uint8_t sequenceTile[sequenceTileBases];
1621 __shared__ typename WarpReduceWord::TempStorage reduceStorage[warpsPerBlock][4];
1622
1623 const uint64_t numKmers = input.kmerCount();
1624 const uint64_t blockStartKmer = static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize;
1625 if (blockStartKmer >= numKmers) {
1626 return;
1627 }
1628
1629 const uint64_t blockKmers = min(Config::cudaBlockSize, numKmers - blockStartKmer);
1630 const auto localKmerIndex = static_cast<uint64_t>(threadIdx.x);
1631 const bool inRange = localKmerIndex < blockKmers;
1632
1633 const bool blockAllValid = prepareSequenceHashTiles<Config>(
1634 input.sequence.data(), blockStartKmer, blockKmers, sequenceTile
1635 );
1636
1637 // Avoid early returns so all warp lanes can participate in the segmented
1638 // warp reductions below.
1639 bool active = inRange;
1640
1641 if (active && !blockAllValid) {
1642 active = kmerIsValid<Config>(sequenceTile, localKmerIndex);
1643 }
1644
1645 // Inactive threads keep zero masks and a per-lane sentinel shard index so
1646 // contiguous run detection naturally splits around them.
1647 uint64_t minimizerHash = 0;
1648 uint64_t wordMask0 = 0;
1649 uint64_t wordMask1 = 0;
1650 uint64_t wordMask2 = 0;
1651 uint64_t wordMask3 = 0;
1652
1653 if (active) {
1654 const uint64_t packedKmer =
1655 packKmerFromTile<Config, Config::k>(sequenceTile, localKmerIndex);
1656 minimizerHash = packedKmerMinimizerHash<Config>(packedKmer);
1657
1658 uint64_t h_s = packedKmerSmerHash<Config>(packedKmer, 0);
1659 Filter<Config>::Shard::sectorizedHashToMasks(
1660 h_s, wordMask0, wordMask1, wordMask2, wordMask3
1661 );
1662 _Pragma("unroll")
1663 for (uint64_t smerOffset = 1; smerOffset < Config::findereSpan; ++smerOffset) {
1664 h_s = packedKmerSmerHash<Config>(packedKmer, smerOffset);
1665 Filter<Config>::Shard::sectorizedHashToMasks(
1666 h_s, wordMask0, wordMask1, wordMask2, wordMask3
1667 );
1668 }
1669 }
1670
1671 // Warp-local segmented reductions: contiguous threads sharing the same
1672 // shard merge their masks so only the run head issues the atomicOrs.
1673 const auto shardIdx =
1674 static_cast<uint32_t>(active ? (minimizerHash & (shards.size() - 1)) : ~threadIdx.x);
1675
1676 const uint32_t lane = threadIdx.x & (warpSize - 1);
1677 const uint32_t warpIdx = threadIdx.x / warpSize;
1678 const uint32_t prevShardIdx = __shfl_up_sync(0xffffffff, shardIdx, 1);
1679 const bool runHead = (lane == 0) || (shardIdx != prevShardIdx);
1680 const BitwiseOr<uint64_t> bitwiseOr{};
1681
1682 wordMask0 = WarpReduceWord(reduceStorage[warpIdx][0])
1683 .HeadSegmentedReduce(wordMask0, runHead, bitwiseOr);
1684 wordMask1 = WarpReduceWord(reduceStorage[warpIdx][1])
1685 .HeadSegmentedReduce(wordMask1, runHead, bitwiseOr);
1686 wordMask2 = WarpReduceWord(reduceStorage[warpIdx][2])
1687 .HeadSegmentedReduce(wordMask2, runHead, bitwiseOr);
1688 wordMask3 = WarpReduceWord(reduceStorage[warpIdx][3])
1689 .HeadSegmentedReduce(wordMask3, runHead, bitwiseOr);
1690
1691 if (runHead && active) {
1692 auto& shard = shards[shardIdx];
1693 if (wordMask0 != 0) {
1694 atomicOrWord(&shard.words[0], wordMask0);
1695 }
1696 if (wordMask1 != 0) {
1697 atomicOrWord(&shard.words[1], wordMask1);
1698 }
1699 if (wordMask2 != 0) {
1700 atomicOrWord(&shard.words[2], wordMask2);
1701 }
1702 if (wordMask3 != 0) {
1703 atomicOrWord(&shard.words[3], wordMask3);
1704 }
1705 }
1706}
__device__ __forceinline__ void atomicOrWord(uint64_t *ptr, uint64_t value)
Atomically ORs value into the device word at ptr.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ kmerIsValid()

template<typename Config >
__device__ __forceinline__ bool cusbf::detail::kmerIsValid ( const uint8_t tile,
uint64_t  start 
)

Definition at line 1447 of file BloomFilter.cuh.

1447 {
1448 _Pragma("unroll")
1449 for (uint64_t i = 0; i < Config::k; ++i) {
1450 if (tile[start + i] == Config::Alphabet::invalidSymbol) {
1451 return false;
1452 }
1453 }
1454 return true;
1455}
Compile-time configuration for a cusbf::Filter.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ loadShardWords4()

template<typename Config >
__device__ __forceinline__ void cusbf::detail::loadShardWords4 ( const typename Filter< Config >::Shard *  shards,
uint64_t  shardIndex,
uint64_t w 
)

Loads all four 64-bit words of a shard into a local array.

On sm_100+ issues a single 256-bit non-coherent global load, on older architectures falls back to two 128-bit loads.

Template Parameters
ConfigFilter configuration.
Parameters
shardsPointer to the device shard array.
shardIndexIndex of the shard to load.
wOutput array of (at least) four words.

Definition at line 1372 of file BloomFilter.cuh.

1372 {
1373#if __CUDA_ARCH__ >= 1000
1374 detail::load256BitGlobalNC(shards[shardIndex].words, w[0], w[1], w[2], w[3]);
1375#else
1376 detail::load128BitGlobalNC(shards[shardIndex].words + 0, w[0], w[1]);
1377 detail::load128BitGlobalNC(shards[shardIndex].words + 2, w[2], w[3]);
1378#endif
1379}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ minimizerHash64()

constexpr __host__ __device__ __forceinline__ uint64_t cusbf::detail::minimizerHash64 ( uint64_t  key)
constexpr

Fast 64-bit hash sufficient for uniform minimizer selection.

A single Knuth multiplicative step — provides enough uniformity for shard selection without the full avalanche quality of hash64.

Parameters
keyPacked m-mer input.
Returns
Hash value used to select the minimum (minimizer).

Definition at line 209 of file hashutil.cuh.

209 {
210 return key * 0x9E3779B97F4A7C15ULL;
211}
Here is the call graph for this function:

◆ multiplicativeSaltLiteral()

template<uint64_t Index>
__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::multiplicativeSaltLiteral ( )
constexpr

Returns the multiplicative salt constant for hash function Index.

Template Parameters
IndexSalt index in [0, 15].
Returns
Salt value.

Definition at line 251 of file BloomFilter.cuh.

251 {
252 static_assert(Index < 16, "Salt index out of range");
254}
Compile-time golden-ratio-derived multiplicative salt constants.
Here is the call graph for this function:

◆ packedKmerMinimizerHash()

template<typename Config >
__device__ __forceinline__ uint64_t cusbf::detail::packedKmerMinimizerHash ( uint64_t  packedKmer)

Computes the minimizer hash for a packed k-mer.

Iterates over all m-mers within the k-mer and returns the minimum hash value, which is used to select the target shard.

Template Parameters
ConfigFilter configuration.
Parameters
packedKmer2-bit packed k-mer.
Returns
Minimizer hash value.

Definition at line 1332 of file BloomFilter.cuh.

1332 {
1333 uint64_t minimizerHash = kInvalidHash;
1334 _Pragma("unroll")
1335 for (uint64_t offset = 0; offset < Config::minimizerSpan; ++offset) {
1336 const uint64_t packedMmer =
1337 extractPackedSubwindow<Config, Config::m, Config::k>(packedKmer, offset);
1338 minimizerHash = min(minimizerHash, detail::minimizerHash64(packedMmer));
1339 }
1340 return minimizerHash;
1341}
constexpr uint64_t kInvalidHash
Sentinel hash value indicating "no valid minimizer found".
Here is the call graph for this function:
Here is the caller graph for this function:

◆ packedKmerSmerHash()

template<typename Config >
__device__ __forceinline__ uint64_t cusbf::detail::packedKmerSmerHash ( uint64_t  packedKmer,
uint64_t  start 
)

Computes the hash for the s-mer at position start within a packed k-mer.

Template Parameters
ConfigFilter configuration.
Parameters
packedKmer2-bit packed k-mer.
startZero-based start position of the s-mer within the k-mer.
Returns
Hash of the s-mer.

Definition at line 1353 of file BloomFilter.cuh.

1353 {
1354 const uint64_t packedSmer =
1355 extractPackedSubwindow<Config, Config::s, Config::k>(packedKmer, start);
1356 return detail::hash64(packedSmer);
1357}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ packedWindowMask()

template<typename Config , uint64_t Length>
__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::packedWindowMask ( )
constexpr

Returns a bitmask covering Length packed alphabet symbols.

Returns UINT64_MAX when the packed window consumes all 64 bits.

Template Parameters
LengthNumber of symbols.

Definition at line 285 of file BloomFilter.cuh.

285 {
286 if constexpr (Length * Config::symbolBits >= 64) {
287 return std::numeric_limits<uint64_t>::max();
288 } else {
289 return (uint64_t{1} << (Config::symbolBits * Length)) - 1;
290 }
291}
Here is the call graph for this function:

◆ packKmerFromTile()

template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t cusbf::detail::packKmerFromTile ( const uint8_t tile,
uint64_t  start 
)

Packs K symbols from a shared-memory tile into an integer.

Template Parameters
Kk-mer length.
Parameters
tileEncoded symbol tile in shared memory.
startStart position within the tile.
Returns
Packed k-mer.

Definition at line 1390 of file BloomFilter.cuh.

1390 {
1391 uint64_t packed = 0;
1392 _Pragma("unroll")
1393 for (uint64_t i = 0; i < K; ++i) {
1394 packed = (packed << Config::symbolBits) | (tile[start + i] & Config::symbolMask);
1395 }
1396 return packed;
1397}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ prepareSequenceHashTiles()

template<typename Config >
__device__ __forceinline__ bool cusbf::detail::prepareSequenceHashTiles ( const char sequence,
uint64_t  blockStartKmer,
uint64_t  blockKmers,
uint8_t sequenceTile 
)

Cooperatively loads and encodes a tile of symbols into shared memory.

All threads in the block participate. The return value (via __syncthreads_count) is true only if every base in the tile is a valid alphabet symbol.

Template Parameters
ConfigFilter configuration.
Parameters
sequenceDevice-resident sequence pointer.
blockStartKmerIndex of the first k-mer assigned to this block.
blockKmersNumber of k-mers handled by this block.
sequenceTileShared-memory output buffer (blockKmers + k - 1 bytes).
Returns
true if no invalid symbols are present in the tile.

Definition at line 1472 of file BloomFilter.cuh.

1477 {
1478 const uint64_t tileBases = blockKmers + Config::k - 1;
1479
1480 bool localInvalidBase = false;
1481 for (uint64_t idx = threadIdx.x; idx < tileBases; idx += Config::cudaBlockSize) {
1482 const uint8_t encodedBase =
1483 Config::Alphabet::encode(sequence + (blockStartKmer + idx) * Config::symbolWidth);
1484 sequenceTile[idx] = encodedBase;
1485 localInvalidBase |= (encodedBase == Config::Alphabet::invalidSymbol);
1486 }
1487 return __syncthreads_count(localInvalidBase) == 0;
1488}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ sectorizedContainsPackedKmer()

template<typename Config >
__device__ __forceinline__ bool cusbf::detail::sectorizedContainsPackedKmer ( uint64_t  packedKmer,
const uint64_t w 
)

Tests whether a packed k-mer is present in a pre-loaded shard.

Checks all s-mer hashes across the k-mer against the four shard words. Returns false as soon as any required bit is absent.

Template Parameters
ConfigFilter configuration.
Parameters
packedKmerPacked k-mer to query.
wThe four pre-loaded shard words.
Returns
true if all required bits are set.

Definition at line 1429 of file BloomFilter.cuh.

1429 {
1430 bool present = true;
1431 _Pragma("unroll")
1432 for (uint64_t smerOffset = 0; smerOffset < Config::findereSpan; ++smerOffset) {
1433 const uint64_t smerHash = packedKmerSmerHash<Config>(packedKmer, smerOffset);
1434 detail::forEachHashIndex<Config>(
1435 [&]<uint64_t HashIndex>(std::integral_constant<uint64_t, HashIndex>) {
1436 constexpr uint64_t s = HashIndex % Config::blockWordCount;
1437 const uint64_t bitPos =
1438 Filter<Config>::Shard::template sectorizedBitAddress<HashIndex>(smerHash);
1439 present &= ((w[s] >> bitPos) & 1) != 0;
1440 }
1441 );
1442 }
1443 return present;
1444}
cuSBF GPU-accelerated sectorized Bloom filter.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ separatorByteAlwaysEncodesInvalid()

template<typename T >
consteval bool cusbf::detail::separatorByteAlwaysEncodesInvalid ( )

Tests that for every position in the input, placing the separator byte at that position always results in an invalid encoding.

This is a necessary condition for the separator to function correctly when concatenating sequences, as it prevents the creation of valid symbols that span across sequence boundaries.

Template Parameters
TAlphabet type to test.
Returns
bool True if the separator byte always produces an invalid encoding at every position, false if any position allows the separator to be part of a valid encoding.

Definition at line 67 of file Alphabet.cuh.

67 {
68 for (uint64_t separatorPosition = 0; separatorPosition < T::symbolWidth; ++separatorPosition) {
69 char input[T::symbolWidth]{};
70 if (!separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, 0)) {
71 return false;
72 }
73 }
74 return true;
75}
Here is the call graph for this function:

◆ separatorPositionAlwaysEncodesInvalid()

template<typename T >
consteval bool cusbf::detail::separatorPositionAlwaysEncodesInvalid ( char *  input,
uint64_t  separatorPosition,
uint64_t  index 
)

Recursively tests whether placing the separator byte at any position in an input of valid bytes always results in an invalid encoding.

This ensures that the separator cannot be confused with valid symbols when concatenating sequences.

Template Parameters
TAlphabet type to test.
Parameters
inputBuffer to construct input strings for encoding. Must have length at least T::symbolWidth.
separatorPositionPosition at which to place the separator byte in the input.
indexCurrent index being set in the input. Should be called with 0 initially.
Returns
bool True if the separator byte always produces an invalid encoding, false if any combination of valid bytes with the separator produces a valid encoding.

Definition at line 37 of file Alphabet.cuh.

37 {
38 if (index == T::symbolWidth) {
39 return T::encode(input) == static_cast<uint8_t>(T::invalidSymbol);
40 }
41
42 if (index == separatorPosition) {
43 input[index] = static_cast<char>(T::separator);
44 return separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, index + 1);
45 }
46
47 for (uint64_t byteIndex = 0; byteIndex < validByteCount<T>(); ++byteIndex) {
48 input[index] = T::validBytes[byteIndex];
49 if (!separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, index + 1)) {
50 return false;
51 }
52 }
53 return true;
54}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ validByteCount()

template<typename T >
consteval uint64_t cusbf::detail::validByteCount ( )

Definition at line 14 of file Alphabet.cuh.

14 {
15 uint64_t count = 0;
16 while (T::validBytes[count] != '\0') {
17 ++count;
18 }
19 return count;
20}

Variable Documentation

◆ kContainsSequenceStride

constexpr uint32_t cusbf::detail::kContainsSequenceStride = 4
inlineconstexpr

Definition at line 143 of file BloomFilter.cuh.

◆ kInvalidHash

constexpr uint64_t cusbf::detail::kInvalidHash = std::numeric_limits<uint64_t>::max()
inlineconstexpr

Sentinel hash value indicating "no valid minimizer found".

Definition at line 167 of file BloomFilter.cuh.