cuSBF
Loading...
Searching...
No Matches
BloomFilter.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <cuda/__cmath/ceil_div.h>
4#include <cuda_runtime.h>
5
6#include <cuda/std/bit>
7#include <cuda/std/span>
8#include <cuda/stream>
9
10#include <cub/warp/warp_reduce.cuh>
11
12#include <thrust/copy.h>
13#include <thrust/detail/execution_policy.h>
14#include <thrust/device_vector.h>
15#include <thrust/execution_policy.h>
16#include <thrust/fill.h>
17#include <thrust/transform_reduce.h>
18
19#include <algorithm>
20#include <concepts>
21#include <cstddef>
22#include <cstdint>
23#include <limits>
24#include <stdexcept>
25#include <string_view>
26#include <type_traits>
27#include <utility>
28#include <vector>
29
30#include "Alphabet.cuh"
31#include "device_span.cuh"
32#include "Fastx.hpp"
33#include "hashutil.cuh"
34#include "helpers.cuh"
35
36namespace cusbf {
37
53template <
54 uint16_t K_,
55 uint16_t S_,
56 uint16_t M_,
57 uint64_t HashCount_ = 4,
58 uint64_t CudaBlockSize_ = 256,
59 Alphabet Alphabet_ = DnaAlphabet>
60struct Config {
61 using Alphabet = Alphabet_;
62
63 static constexpr uint16_t k = K_;
64 static constexpr uint16_t m = M_;
65 static constexpr uint16_t s = S_;
66 static constexpr uint64_t hashCount = HashCount_;
67 static constexpr uint64_t alphabetSize = Alphabet::symbolCount;
68 static constexpr uint64_t symbolWidth = Alphabet::symbolWidth;
69 static constexpr uint64_t symbolBits = cuda::std::bit_width(alphabetSize - 1);
70 static constexpr uint64_t symbolMask = (uint64_t{1} << symbolBits) - 1;
71 static constexpr uint64_t filterBlockBits = 256;
72 static constexpr uint64_t cudaBlockSize = CudaBlockSize_;
73
74 static constexpr uint64_t wordBits = 64;
75 static constexpr uint64_t blockWordCount = filterBlockBits / wordBits;
76 static constexpr uint64_t minimizerSpan = k - m + 1;
77 static constexpr uint64_t findereSpan = k - s + 1;
78 static constexpr uint64_t insertGroupSize = blockWordCount;
79 static constexpr uint64_t queryGroupSize = 1;
80 static constexpr uint64_t maxRunKmers = cudaBlockSize;
81
82 static_assert(k > 0, "k must be positive");
83 static_assert(symbolWidth > 0, "alphabet symbolWidth must be positive");
84 static_assert(m > 0 && m <= k, "m must satisfy 0 < m <= k");
85 static_assert(s > 0 && s <= k, "s must satisfy 0 < s <= k");
86 static_assert(k * symbolBits <= 64, "k-mer must fit in one packed uint64_t");
87 static_assert(m * symbolBits <= 64, "m-mer must fit in one packed uint64_t");
88 static_assert(s * symbolBits <= 64, "s-mer must fit in one packed uint64_t");
89 static_assert(hashCount > 0, "At least one Bloom hash is required");
90 static_assert(hashCount <= 16, "This implementation provides 16 multiplicative salts");
91 static_assert(filterBlockBits >= wordBits, "Filter block must contain at least one word");
92 static_assert(
93 cuda::std::has_single_bit(filterBlockBits),
94 "Filter block size must be a power of two"
95 );
96 static_assert(filterBlockBits % wordBits == 0, "Filter block size must align to the word size");
97 static_assert(blockWordCount <= 32, "At most one warp may cooperate on a filter block");
98 static_assert(
99 cuda::std::has_single_bit(blockWordCount),
100 "blockWordCount must be a power of two"
101 );
102 static_assert(insertGroupSize <= 32, "insertGroupSize must fit in one warp");
103 static_assert(queryGroupSize <= 32, "queryGroupSize must fit in one warp");
104 static_assert(
105 cuda::std::has_single_bit(insertGroupSize),
106 "insertGroupSize must be a power of two"
107 );
108 static_assert(
109 cuda::std::has_single_bit(queryGroupSize),
110 "queryGroupSize must be a power of two"
111 );
112 static_assert(
114 "Sectorized layout requires hashCount >= blockWordCount"
115 );
116 static_assert(
118 "Hash count must distribute evenly across shard words"
119 );
120 static_assert(cudaBlockSize % 32 == 0, "CUDA block size must be a multiple of one warp");
121 static_assert(
123 "cudaBlockSize must divide insertGroupSize"
124 );
125 static_assert(cudaBlockSize % queryGroupSize == 0, "cudaBlockSize must divide queryGroupSize");
126};
127
128template <typename Config>
129class Filter;
130
131namespace detail {
132
133template <typename T>
134struct BitwiseOr {
136 return lhs | rhs;
137 }
138};
139
140template <typename Config>
141struct SequenceKmerInput;
142
144
145template <typename Config>
150);
151
152template <typename Config>
154 const char* sequence,
158);
159
160template <typename Config>
164);
165
167inline constexpr uint64_t kInvalidHash = std::numeric_limits<uint64_t>::max();
176template <uint64_t Index>
178
179template <>
180struct SaltLiteral<0> {
181 static constexpr uint64_t value = 0x9E37'79B9'7F4A'7C15ULL;
182};
183template <>
184struct SaltLiteral<1> {
185 static constexpr uint64_t value = 0xC2B2'AE3D'27D4'EB4FULL;
186};
187template <>
188struct SaltLiteral<2> {
189 static constexpr uint64_t value = 0x1656'67B1'9E37'79F9ULL;
190};
191template <>
192struct SaltLiteral<3> {
193 static constexpr uint64_t value = 0x85EB'CA77'C2B2'AE63ULL;
194};
195template <>
196struct SaltLiteral<4> {
197 static constexpr uint64_t value = 0x27D4'EB2F'1656'67C5ULL;
198};
199template <>
200struct SaltLiteral<5> {
201 static constexpr uint64_t value = 0x94D0'49BB'1331'11EFULL;
202};
203template <>
204struct SaltLiteral<6> {
205 static constexpr uint64_t value = 0xBF58'476D'1CE4'E5B9ULL;
206};
207template <>
208struct SaltLiteral<7> {
209 static constexpr uint64_t value = 0xD6E8'FEB8'6659'FD93ULL;
210};
211template <>
212struct SaltLiteral<8> {
213 static constexpr uint64_t value = 0xA076'1D64'78BD'642FULL;
214};
215template <>
216struct SaltLiteral<9> {
217 static constexpr uint64_t value = 0xE703'7ED1'A0B4'28DBULL;
218};
219template <>
220struct SaltLiteral<10> {
221 static constexpr uint64_t value = 0x8EBC'6AF0'9C88'C6E3ULL;
222};
223template <>
224struct SaltLiteral<11> {
225 static constexpr uint64_t value = 0x5899'65CC'7537'4CC3ULL;
226};
227template <>
228struct SaltLiteral<12> {
229 static constexpr uint64_t value = 0x1D8E'4E27'C47D'124FULL;
230};
231template <>
232struct SaltLiteral<13> {
233 static constexpr uint64_t value = 0xEB44'9C93'FBBE'A6B5ULL;
234};
235template <>
236struct SaltLiteral<14> {
237 static constexpr uint64_t value = 0xDB4F'0B91'75AE'2165ULL;
238};
239template <>
240struct SaltLiteral<15> {
241 static constexpr uint64_t value = 0xBBE0'56FD'ADE1'4B91ULL;
242};
243
250template <uint64_t Index>
252 static_assert(Index < 16, "Salt index out of range");
254}
255
257template <typename Config, typename Fn, uint64_t... HashIndices>
259forEachHashIndexImpl(Fn&& fn, std::index_sequence<HashIndices...>) {
260 (fn(std::integral_constant<uint64_t, HashIndices>{}), ...);
261}
262
270template <typename Config, typename Fn>
273 static_cast<Fn&&>(fn), std::make_index_sequence<Config::hashCount>{}
274 );
275}
276
284template <typename Config, uint64_t Length>
286 if constexpr (Length * Config::symbolBits >= 64) {
287 return std::numeric_limits<uint64_t>::max();
288 } else {
289 return (uint64_t{1} << (Config::symbolBits * Length)) - 1;
290 }
291}
292
305template <typename Config, uint64_t WindowLength, uint64_t K>
308 static_assert(WindowLength <= K, "WindowLength must not exceed K");
309 return (packedKmer >> (Config::symbolBits * (K - (start + WindowLength)))) &
311}
312
320 atomicOr(reinterpret_cast<unsigned long long*>(ptr), static_cast<unsigned long long>(value));
321}
322
323} // namespace detail
324
337template <typename Config>
338class Filter {
339 private:
340 struct PreparedRecordRange {
341 uint64_t recordIndex{};
342 uint64_t inputOffset{};
343 uint64_t outputOffset{};
344 uint64_t size{};
345 uint64_t validKmers{};
346 };
347 struct PreparedRecordBatch {
348 std::string sequence;
349 std::vector<PreparedRecordRange> records;
350 };
351 struct FastxChunkAssembly {
352 explicit FastxChunkAssembly(size_t reservedBytes) {
353 sequence.reserve(reservedBytes);
354 }
355
356 void appendRecord(std::string_view recordSequence) {
357 ranges.push_back(
358 RecordRange{
359 static_cast<uint64_t>(sequence.size()),
360 static_cast<uint64_t>(recordSequence.size()),
361 }
362 );
363 sequence.append(recordSequence);
364 }
365
366 [[nodiscard]] bool empty() const {
367 return ranges.empty();
368 }
369
370 [[nodiscard]] bool reachedTarget(size_t targetBytes) const {
371 return sequence.size() >= targetBytes;
372 }
373
374 [[nodiscard]] uint64_t recordCount() const {
375 return static_cast<uint64_t>(ranges.size());
376 }
377
378 void clear() {
379 sequence.clear();
380 ranges.clear();
381 }
382
383 std::string sequence;
384 std::vector<RecordRange> ranges;
385 };
386
387 public:
397 struct alignas(32) Shard {
398 static constexpr uint64_t wordCount = Config::blockWordCount;
399 static constexpr uint64_t wordBits = Config::wordBits;
400 static constexpr int wordBitsLog2 = cuda::std::bit_width(wordBits) - 1;
401 static constexpr uint64_t wordMask = (1ULL << wordBitsLog2) - 1;
402 static constexpr int hashShift = 64 - wordBitsLog2;
403 static constexpr uint64_t sliceWidth = 64 / Config::hashCount;
404 static constexpr bool useBitSlicing = sliceWidth >= wordBitsLog2;
405
406 uint64_t words[wordCount];
407
418 template <uint64_t HashIndex>
419 [[nodiscard]] constexpr __host__ __device__ static uint64_t sectorizedBitAddress(
420 uint64_t baseHash
421 ) {
422 static_assert(HashIndex < Config::hashCount, "Hash index out of range");
423 // When there are enough bits in a 64-bit hash to give each hash
424 // index its own slice, avoid the extra multiply and use
425 // bit-slicing instead.
426 if constexpr (useBitSlicing) {
427 return (baseHash >> (sliceWidth * HashIndex)) & wordMask;
428 } else {
429 const uint64_t mixed = baseHash * detail::multiplicativeSaltLiteral<HashIndex>();
430 return mixed >> hashShift;
431 }
432 }
433
446 __device__ __forceinline__ static void sectorizedHashToMasks(
447 uint64_t baseHash,
448 uint64_t& mask0,
449 uint64_t& mask1,
450 uint64_t& mask2,
451 uint64_t& mask3
452 ) {
454 [&]<uint64_t HashIndex>(std::integral_constant<uint64_t, HashIndex>) {
455 constexpr uint64_t s = HashIndex % Config::blockWordCount;
456 const uint64_t bitPos = sectorizedBitAddress<HashIndex>(baseHash);
457 const uint64_t bit = uint64_t{1} << bitPos;
458 // clang-format off
459 if constexpr (s == 0) mask0 |= bit;
460 else if constexpr (s == 1) mask1 |= bit;
461 else if constexpr (s == 2) mask2 |= bit;
462 else mask3 |= bit;
463 // clang-format on
464 }
465 );
466 }
467 };
468
469 static_assert(Config::blockWordCount == 4, "Filter only supports the fused 256-bit shard path");
470 static_assert(
472 "Fused path expects Theta=1 independent query mapping"
473 );
474 static_assert(
476 "Fused path expects horizontal insert mapping across shard words"
477 );
478
487 explicit Filter(uint64_t requestedFilterBits)
488 : numShards_(
489 cuda::std::bit_ceil(
490 std::max<uint64_t>(
491 1,
492 cuda::ceil_div(requestedFilterBits, Config::filterBlockBits)
493 )
494 )
495 ),
496 filterBits_(numShards_ * Config::filterBlockBits),
497 d_shards_(numShards_) {
498 clear();
499 }
500
501 Filter(const Filter&) = delete;
502 Filter& operator=(const Filter&) = delete;
503 Filter(Filter&&) = default;
504 Filter& operator=(Filter&&) = default;
505 ~Filter() = default;
506
518 [[nodiscard]] uint64_t
519 insertSequence(std::string_view sequence, cuda::stream_ref stream = cudaStream_t{}) {
520 if (recordSymbolCount(sequence.size()) < Config::k) {
521 return 0;
522 }
523
524 const uint64_t totalKmers = recordKmerCount(sequence.size());
525 const auto d_sequence = stagedSequenceView({sequence.data(), sequence.size()}, stream);
526 launchInsertSequence(d_sequence, stream);
527 stream.sync();
528 return totalKmers;
529 }
530
541 [[nodiscard]] uint64_t insertSequenceDevice(
542 device_span<const char> d_sequence,
543 cuda::stream_ref stream = cudaStream_t{}
544 ) {
545 const uint64_t totalKmers = sequenceKmerCount(d_sequence);
546 if (totalKmers == 0) {
547 return 0;
548 }
549
550 launchInsertSequence(d_sequence, stream);
551 return totalKmers;
552 }
553
568 [[nodiscard]] FastxInsertReport
569 insertRecordBatch(RecordBatchView batch, cuda::stream_ref stream = cudaStream_t{}) {
570 const PreparedRecordBatch prepared = prepareRecordBatch(batch);
571 FastxInsertReport report;
572 report.recordsIndexed = prepared.records.size();
573 for (const PreparedRecordRange& record : prepared.records) {
574 report.indexedBases += record.size;
575 report.insertedKmers += record.validKmers;
576 }
577 if (!prepared.sequence.empty()) {
578 (void)insertSequence(prepared.sequence, stream);
579 }
580 return report;
581 }
582
595 [[nodiscard]] FastxInsertReport insertFastx(
596 std::istream& input,
597 double fillFraction = 0.7,
598 cuda::stream_ref stream = cudaStream_t{}
599 ) {
600 return insertFastxStream(input, "<stream>", fillFraction, stream);
601 }
602
608 [[nodiscard]] FastxInsertReport insertFastxFile(
609 std::string_view path,
610 double fillFraction = 0.7,
611 cuda::stream_ref stream = cudaStream_t{}
612 ) {
613 auto input = detail::openFastxFile(path);
614 return insertFastxStream(*input, path, fillFraction, stream);
615 }
616
628 device_span<const char> d_sequence,
629 device_span<uint8_t> d_output,
630 cuda::stream_ref stream = cudaStream_t{}
631 ) const {
632 if (sequenceKmerCount(d_sequence) == 0) {
633 return;
634 }
635
636 launchContainsSequence(d_sequence, d_output, stream);
637 }
638
650 [[nodiscard]] std::vector<uint8_t>
651 containsSequence(std::string_view sequence, cuda::stream_ref stream = cudaStream_t{}) const {
652 if (recordSymbolCount(sequence.size()) < Config::k) {
653 return {};
654 }
655
656 std::vector<uint8_t> output(recordKmerCount(sequence.size()));
657
658 const auto d_sequence = stagedSequenceView({sequence.data(), sequence.size()}, stream);
659 ensureResultCapacity(output.size());
660 launchContainsSequence(
661 d_sequence,
662 device_span<uint8_t>{thrust::raw_pointer_cast(d_resultBuffer_.data()), output.size()},
663 stream
664 );
665 CUSBF_CUDA_CALL(cudaMemcpyAsync(
666 output.data(),
667 thrust::raw_pointer_cast(d_resultBuffer_.data()),
668 output.size() * sizeof(uint8_t),
669 cudaMemcpyDeviceToHost,
670 stream.get()
671 ));
672
673 stream.sync();
674 return output;
675 }
676
691 [[nodiscard]] FastxQueryReport
692 queryRecordBatch(RecordBatchView batch, cuda::stream_ref stream = cudaStream_t{}) const {
693 return queryRecordBatch(batch, [](const RecordQueryView&) {}, stream);
694 }
695
709 template <typename Consumer>
710 [[nodiscard]] FastxQueryReport queryRecordBatch(
711 RecordBatchView batch,
712 Consumer&& consume,
713 cuda::stream_ref stream = cudaStream_t{}
714 ) const {
715 return queryPreparedRecordBatch(prepareRecordBatch(batch), batch.sequence, consume, stream);
716 }
717
723 [[nodiscard]] FastxQueryReport queryFastx(
724 std::istream& input,
725 double fillFraction = 0.7,
726 cuda::stream_ref stream = cudaStream_t{}
727 ) const {
728 return queryFastxStream(input, "<stream>", fillFraction, stream);
729 }
730
736 [[nodiscard]] FastxQueryReport queryFastxFile(
737 std::string_view path,
738 double fillFraction = 0.7,
739 cuda::stream_ref stream = cudaStream_t{}
740 ) const {
741 auto input = detail::openFastxFile(path);
742 return queryFastxStream(*input, path, fillFraction, stream);
743 }
744
758 template <typename Consumer>
759 [[nodiscard]] FastxQueryReport queryFastxRecords(
760 std::istream& input,
761 Consumer&& consume,
762 double fillFraction = 0.7,
763 cuda::stream_ref stream = cudaStream_t{}
764 ) const {
765 return queryFastxRecordsStream(input, "<stream>", consume, fillFraction, stream);
766 }
767
773 template <typename Consumer>
774 [[nodiscard]] FastxQueryReport queryFastxFileRecords(
775 std::string_view path,
776 Consumer&& consume,
777 double fillFraction = 0.7,
778 cuda::stream_ref stream = cudaStream_t{}
779 ) const {
780 auto input = detail::openFastxFile(path);
781 return queryFastxRecordsStream(*input, path, consume, fillFraction, stream);
782 }
783
795 [[nodiscard]] FastxDetailedQueryReport queryFastxDetailed(
796 std::istream& input,
797 double fillFraction = 0.7,
798 cuda::stream_ref stream = cudaStream_t{}
799 ) const {
800 return queryFastxDetailedStream(input, "<stream>", fillFraction, stream);
801 }
802
809 [[nodiscard]] FastxDetailedQueryReport queryFastxFileDetailed(
810 std::string_view path,
811 double fillFraction = 0.7,
812 cuda::stream_ref stream = cudaStream_t{}
813 ) const {
814 auto input = detail::openFastxFile(path);
815 return queryFastxDetailedStream(*input, path, fillFraction, stream);
816 }
817
823 void clear(cuda::stream_ref stream = cudaStream_t{}) {
824 CUSBF_CUDA_CALL(cudaMemsetAsync(
825 thrust::raw_pointer_cast(d_shards_.data()),
826 0,
827 d_shards_.size() * sizeof(Shard),
828 stream.get()
829 ));
830
831 stream.sync();
832 }
833
839 [[nodiscard]] float loadFactor() const {
840 const auto* wordsBegin =
841 reinterpret_cast<const uint64_t*>(thrust::raw_pointer_cast(d_shards_.data()));
842 const uint64_t totalWords = numShards_ * Config::blockWordCount;
843 const uint64_t setBits = thrust::transform_reduce(
844 thrust::device,
845 wordsBegin,
846 wordsBegin + totalWords,
847 [] __device__(uint64_t w) -> uint64_t { return cuda::std::popcount(w); },
848 uint64_t{0},
849 cuda::std::plus<uint64_t>()
850 );
851 return static_cast<float>(setBits) / static_cast<float>(filterBits_);
852 }
853
855 [[nodiscard]] uint64_t filterBits() const {
856 return filterBits_;
857 }
858
860 [[nodiscard]] uint64_t numShards() const {
861 return numShards_;
862 }
863
864 private:
865 uint64_t numShards_{};
866 uint64_t filterBits_{};
867
868 thrust::device_vector<Shard> d_shards_;
869 mutable thrust::device_vector<char> d_sequence_;
870 mutable thrust::device_vector<uint8_t> d_resultBuffer_;
871
873 [[nodiscard]] uint64_t sizeBytes() const {
874 return numShards() * sizeof(Shard);
875 }
876
877 [[nodiscard]] static uint64_t recordSymbolCount(uint64_t bases) {
878 return bases / Config::symbolWidth;
879 }
880
881 [[nodiscard]] static uint64_t recordKmerCount(uint64_t bases) {
882 const uint64_t symbols = recordSymbolCount(bases);
883 return symbols < Config::k ? 0 : symbols - Config::k + 1;
884 }
885
886 [[nodiscard]] static uint64_t validRecordKmerCount(std::string_view sequence) {
887 if (recordSymbolCount(sequence.size()) < Config::k) {
888 return 0;
889 }
890
891 uint64_t invalidSymbols = 0;
892 for (uint64_t i = 0; i < Config::k; ++i) {
893 invalidSymbols += Config::Alphabet::encode(sequence.data() + i * Config::symbolWidth) ==
894 Config::Alphabet::invalidSymbol;
895 }
896
897 uint64_t validKmers = invalidSymbols == 0 ? 1 : 0;
898 for (uint64_t start = 1; start < recordKmerCount(sequence.size()); ++start) {
899 invalidSymbols -=
900 Config::Alphabet::encode(sequence.data() + (start - 1) * Config::symbolWidth) ==
901 Config::Alphabet::invalidSymbol;
902 invalidSymbols += Config::Alphabet::encode(
903 sequence.data() + (start + Config::k - 1) * Config::symbolWidth
904 ) == Config::Alphabet::invalidSymbol;
905 validKmers += invalidSymbols == 0;
906 }
907 return validKmers;
908 }
909
910 static void appendRecordBoundary(std::string& sequence) {
911 const uint64_t remainder = sequence.size() % Config::symbolWidth;
912 if (remainder != 0) {
913 sequence.append(
914 Config::symbolWidth - remainder, static_cast<char>(Config::Alphabet::separator)
915 );
916 }
917 sequence.append(Config::symbolWidth, static_cast<char>(Config::Alphabet::separator));
918 }
919
920 static void validateRecordBatch(RecordBatchView batch) {
921 uint64_t nextOffset = 0;
922 for (const RecordRange& record : batch.records) {
923 if (record.sequenceOffset < nextOffset) {
924 throw std::invalid_argument(
925 "record batch ranges must be ordered and non-overlapping"
926 );
927 }
928 if (record.sequenceOffset > batch.sequence.size() ||
929 record.sequenceBytes > batch.sequence.size() - record.sequenceOffset) {
930 throw std::invalid_argument("record batch range exceeds the source sequence");
931 }
932 if (record.sequenceOffset % Config::symbolWidth != 0 ||
933 record.sequenceBytes % Config::symbolWidth != 0) {
934 throw std::invalid_argument(
935 "record batch ranges must align to the configured alphabet symbol width"
936 );
937 }
938 nextOffset = record.sequenceOffset + record.sequenceBytes;
939 }
940 }
941
942 static void appendPreparedRecord(
943 std::string& output,
944 std::vector<PreparedRecordRange>& ranges,
945 uint64_t recordIndex,
946 uint64_t inputOffset,
947 std::string_view recordSequence
948 ) {
949 if (!output.empty()) {
950 appendRecordBoundary(output);
951 }
952 const uint64_t outputOffset = recordSymbolCount(output.size());
953 output.append(recordSequence);
954 ranges.push_back(
955 PreparedRecordRange{
956 recordIndex,
957 inputOffset,
958 outputOffset,
959 static_cast<uint64_t>(recordSequence.size()),
960 validRecordKmerCount(recordSequence),
961 }
962 );
963 }
964
965 [[nodiscard]] static PreparedRecordBatch prepareRecordBatch(RecordBatchView batch) {
966 validateRecordBatch(batch);
967
968 PreparedRecordBatch prepared;
969 prepared.sequence.reserve(
970 batch.sequence.size() + batch.records.size() * Config::symbolWidth
971 );
972 prepared.records.reserve(batch.records.size());
973
974 for (uint64_t recordIndex = 0; recordIndex < batch.records.size(); ++recordIndex) {
975 const RecordRange& record = batch.records[recordIndex];
976 appendPreparedRecord(
977 prepared.sequence,
978 prepared.records,
979 recordIndex,
980 record.sequenceOffset,
981 batch.sequence.substr(record.sequenceOffset, record.sequenceBytes)
982 );
983 }
984 return prepared;
985 }
986
987 [[nodiscard]] static RecordBatchView
988 makeBatchView(const std::string& sequence, const std::vector<RecordRange>& ranges) {
989 return RecordBatchView{
990 sequence,
991 cuda::std::span<const RecordRange>{ranges.data(), ranges.size()},
992 };
993 }
994
995 static void accumulateInsertReport(FastxInsertReport& total, const FastxInsertReport& chunk) {
996 total.recordsIndexed += chunk.recordsIndexed;
997 total.indexedBases += chunk.indexedBases;
998 total.insertedKmers += chunk.insertedKmers;
999 }
1000
1001 static void accumulateQueryReport(FastxQueryReport& total, const FastxQueryReport& chunk) {
1002 total.recordsQueried += chunk.recordsQueried;
1003 total.queriedBases += chunk.queriedBases;
1004 total.queriedKmers += chunk.queriedKmers;
1005 total.positiveKmers += chunk.positiveKmers;
1006 }
1007
1008 [[nodiscard]] static size_t fastxChunkTargetBytes(double fillFraction) {
1009 size_t freeBytes = 0;
1010 size_t totalBytes = 0;
1011 CUSBF_CUDA_CALL(cudaMemGetInfo(&freeBytes, &totalBytes));
1012 return static_cast<size_t>(static_cast<double>(freeBytes) * fillFraction);
1013 }
1014
1015 template <typename Consumer>
1016 [[nodiscard]] FastxQueryReport queryPreparedRecordBatch(
1017 const PreparedRecordBatch& batch,
1018 std::string_view inputSequence,
1019 Consumer&& consume,
1020 cuda::stream_ref stream
1021 ) const {
1022 FastxQueryReport report;
1023 report.recordsQueried = batch.records.size();
1024 for (const PreparedRecordRange& record : batch.records) {
1025 report.queriedBases += record.size;
1026 report.queriedKmers += record.validKmers;
1027 }
1028
1029 const auto hits = containsSequence(batch.sequence, stream);
1030 for (const PreparedRecordRange& record : batch.records) {
1031 const uint64_t kmers = recordKmerCount(record.size);
1032 const auto sequence = inputSequence.substr(
1033 static_cast<size_t>(record.inputOffset), static_cast<size_t>(record.size)
1034 );
1035 if (kmers == 0) {
1036 consume(
1037 RecordQueryView{
1038 record.recordIndex,
1039 sequence,
1040 record.size,
1041 record.validKmers,
1042 0,
1043 cuda::std::span<const uint8_t>{},
1044 }
1045 );
1046 continue;
1047 }
1048
1049 const auto* hitBegin = hits.data() + static_cast<ptrdiff_t>(record.outputOffset);
1050 const auto hitSpan =
1051 cuda::std::span<const uint8_t>{hitBegin, static_cast<size_t>(kmers)};
1052 const auto positiveKmers =
1053 static_cast<uint64_t>(std::count(hitSpan.begin(), hitSpan.end(), uint8_t{1}));
1054 report.positiveKmers += positiveKmers;
1055 consume(
1056 RecordQueryView{
1057 record.recordIndex,
1058 sequence,
1059 record.size,
1060 record.validKmers,
1061 positiveKmers,
1062 hitSpan,
1063 }
1064 );
1065 }
1066 return report;
1067 }
1068
1070 [[nodiscard]] FastxInsertReport insertFastxStream(
1071 std::istream& input,
1072 std::string_view sourceName,
1073 double fillFraction,
1074 cuda::stream_ref stream
1075 ) {
1076 detail::FastxReader reader(input, sourceName);
1077 detail::FastxRecord record;
1078 FastxInsertReport report;
1079
1080 const auto chunkTargetBytes = fastxChunkTargetBytes(fillFraction);
1081 FastxChunkAssembly chunk(chunkTargetBytes);
1082
1083 auto flush = [&]() {
1084 if (chunk.empty()) {
1085 return;
1086 }
1087 accumulateInsertReport(
1088 report, insertRecordBatch(makeBatchView(chunk.sequence, chunk.ranges), stream)
1089 );
1090 chunk.clear();
1091 };
1092
1093 while (reader.nextRecord(record)) {
1094 chunk.appendRecord(record.sequence);
1095 if (chunk.reachedTarget(chunkTargetBytes)) {
1096 flush();
1097 }
1098 }
1099
1100 flush();
1101 return report;
1102 }
1103
1105 [[nodiscard]] FastxQueryReport queryFastxStream(
1106 std::istream& input,
1107 std::string_view sourceName,
1108 double fillFraction,
1109 cuda::stream_ref stream
1110 ) const {
1111 return queryFastxRecordsStream(
1112 input, sourceName, [](const FastxRecordView&) {}, fillFraction, stream
1113 );
1114 }
1115
1116 template <typename Consumer>
1117 [[nodiscard]] FastxQueryReport queryFastxRecordsStream(
1118 std::istream& input,
1119 std::string_view sourceName,
1120 Consumer&& consume,
1121 double fillFraction,
1122 cuda::stream_ref stream
1123 ) const {
1124 detail::FastxReader reader(input, sourceName);
1125 detail::FastxRecord record;
1126 FastxQueryReport report;
1127
1128 const auto chunkTargetBytes = fastxChunkTargetBytes(fillFraction);
1129 FastxChunkAssembly chunk(chunkTargetBytes);
1130 std::vector<detail::FastxRecord> records;
1131 uint64_t recordIndexBase = 0;
1132
1133 auto flush = [&]() {
1134 if (chunk.empty()) {
1135 return;
1136 }
1137 const FastxQueryReport chunkReport = queryRecordBatch(
1138 makeBatchView(chunk.sequence, chunk.ranges),
1139 [&](const RecordQueryView& recordView) {
1140 const detail::FastxRecord& fastxRecord =
1141 records[static_cast<size_t>(recordView.recordIndex)];
1142 consume(
1143 FastxRecordView{
1144 recordIndexBase + recordView.recordIndex,
1145 fastxRecord.header,
1146 fastxRecord.sequence,
1147 recordView.queriedBases,
1148 recordView.queriedKmers,
1149 recordView.positiveKmers,
1150 recordView.hits,
1151 }
1152 );
1153 },
1154 stream
1155 );
1156 accumulateQueryReport(report, chunkReport);
1157 recordIndexBase += chunk.recordCount();
1158 chunk.clear();
1159 records.clear();
1160 };
1161
1162 while (reader.nextRecord(record)) {
1163 chunk.appendRecord(record.sequence);
1164 records.push_back(std::move(record));
1165 if (chunk.reachedTarget(chunkTargetBytes)) {
1166 flush();
1167 }
1168 }
1169
1170 flush();
1171 return report;
1172 }
1173
1175 [[nodiscard]] FastxDetailedQueryReport queryFastxDetailedStream(
1176 std::istream& input,
1177 std::string_view sourceName,
1178 double fillFraction,
1179 cuda::stream_ref stream
1180 ) const {
1181 FastxDetailedQueryReport report;
1182 report.summary = queryFastxRecordsStream(
1183 input,
1184 sourceName,
1185 [&report](const FastxRecordView& record) {
1186 report.records.push_back(
1187 FastxDetailedQueryRecord{
1188 record.recordIndex,
1189 std::string(record.header),
1190 std::string(record.sequence),
1191 record.queriedBases,
1192 record.queriedKmers,
1193 record.positiveKmers,
1194 std::vector<uint8_t>(record.hits.begin(), record.hits.end()),
1195 }
1196 );
1197 },
1198 fillFraction,
1199 stream
1200 );
1201 return report;
1202 }
1203
1208 void ensureSequenceCapacity(uint64_t bases) const {
1209 if (bases > d_sequence_.size()) {
1210 d_sequence_.resize(bases);
1211 }
1212 }
1213
1218 void ensureResultCapacity(uint64_t kmers) const {
1219 if (kmers > d_resultBuffer_.size()) {
1220 d_resultBuffer_.resize(kmers);
1221 }
1222 }
1223
1229 void stageSequence(cuda::std::span<const char> sequence, cuda::stream_ref stream) const {
1230 ensureSequenceCapacity(sequence.size());
1231 CUSBF_CUDA_CALL(cudaMemcpyAsync(
1232 thrust::raw_pointer_cast(d_sequence_.data()),
1233 sequence.data(),
1234 sequence.size_bytes(),
1235 cudaMemcpyHostToDevice,
1236 stream.get()
1237 ));
1238 }
1239
1240 [[nodiscard]] static uint64_t sequenceKmerCount(device_span<const char> d_sequence) {
1241 return detail::SequenceKmerInput<Config>{d_sequence}.kmerCount();
1242 }
1243
1244 [[nodiscard]] device_span<const char>
1245 stagedSequenceView(cuda::std::span<const char> sequence, cuda::stream_ref stream) const {
1246 stageSequence(sequence, stream);
1247 return device_span<const char>{
1248 thrust::raw_pointer_cast(d_sequence_.data()), sequence.size()
1249 };
1250 }
1251
1257 void launchInsertSequence(device_span<const char> d_sequence, cuda::stream_ref stream) {
1258 const auto input = detail::SequenceKmerInput<Config>{d_sequence};
1259 const uint64_t numKmers = input.kmerCount();
1260 if (numKmers == 0) {
1261 return;
1262 }
1263 const uint64_t gridSize = cuda::ceil_div(numKmers, Config::cudaBlockSize);
1264
1266 <<<gridSize, Config::cudaBlockSize, 0, stream.get()>>>(
1267 input, device_span<Shard>{thrust::raw_pointer_cast(d_shards_.data()), numShards_}
1268 );
1269 CUSBF_CUDA_CALL(cudaGetLastError());
1270 }
1271
1278 void launchContainsSequence(
1279 device_span<const char> d_sequence,
1280 device_span<uint8_t> d_output,
1281 cuda::stream_ref stream
1282 ) const {
1283 const auto input = detail::SequenceKmerInput<Config>{d_sequence};
1284 const uint64_t numKmers = input.kmerCount();
1285 const uint64_t gridSize =
1286 cuda::ceil_div(numKmers, Config::cudaBlockSize * detail::kContainsSequenceStride);
1287
1289 <<<gridSize, Config::cudaBlockSize, 0, stream.get()>>>(
1290 input,
1291 device_span<const Shard>{thrust::raw_pointer_cast(d_shards_.data()), numShards_},
1292 d_output
1293 );
1294 CUSBF_CUDA_CALL(cudaGetLastError());
1295 }
1296};
1297
1298namespace detail {
1299
1306template <typename Config>
1309
1310 [[nodiscard]] constexpr __host__ __device__ uint64_t kmerCount() const {
1311 const uint64_t symbols = sequence.size() / Config::symbolWidth;
1312 return symbols < Config::k ? 0 : (symbols - Config::k + 1);
1313 }
1314
1315 [[nodiscard]] constexpr __host__ __device__ uint64_t smerCount() const {
1316 const uint64_t symbols = sequence.size() / Config::symbolWidth;
1317 return symbols < Config::s ? 0 : (symbols - Config::s + 1);
1318 }
1319};
1320
1331template <typename Config>
1332[[nodiscard]] __device__ __forceinline__ uint64_t packedKmerMinimizerHash(uint64_t packedKmer) {
1333 uint64_t minimizerHash = kInvalidHash;
1334 _Pragma("unroll")
1335 for (uint64_t offset = 0; offset < Config::minimizerSpan; ++offset) {
1336 const uint64_t packedMmer =
1337 extractPackedSubwindow<Config, Config::m, Config::k>(packedKmer, offset);
1338 minimizerHash = min(minimizerHash, detail::minimizerHash64(packedMmer));
1339 }
1340 return minimizerHash;
1341}
1342
1351template <typename Config>
1352[[nodiscard]] __device__ __forceinline__ uint64_t
1353packedKmerSmerHash(uint64_t packedKmer, uint64_t start) {
1354 const uint64_t packedSmer =
1355 extractPackedSubwindow<Config, Config::s, Config::k>(packedKmer, start);
1356 return detail::hash64(packedSmer);
1357}
1358
1370template <typename Config>
1371__device__ __forceinline__ void
1372loadShardWords4(const typename Filter<Config>::Shard* shards, uint64_t shardIndex, uint64_t* w) {
1373#if __CUDA_ARCH__ >= 1000
1374 detail::load256BitGlobalNC(shards[shardIndex].words, w[0], w[1], w[2], w[3]);
1375#else
1376 detail::load128BitGlobalNC(shards[shardIndex].words + 0, w[0], w[1]);
1377 detail::load128BitGlobalNC(shards[shardIndex].words + 2, w[2], w[3]);
1378#endif
1379}
1380
1389template <typename Config, uint64_t K>
1390__device__ __forceinline__ uint64_t packKmerFromTile(const uint8_t* tile, uint64_t start) {
1391 uint64_t packed = 0;
1392 _Pragma("unroll")
1393 for (uint64_t i = 0; i < K; ++i) {
1394 packed = (packed << Config::symbolBits) | (tile[start + i] & Config::symbolMask);
1395 }
1396 return packed;
1397}
1398
1410template <typename Config, uint64_t K>
1411__device__ __forceinline__ uint64_t advancePackedKmer(uint64_t packed, uint8_t newBase) {
1412 return ((packed << Config::symbolBits) | (newBase & Config::symbolMask)) &
1413 packedWindowMask<Config, K>();
1414}
1415
1427template <typename Config>
1428__device__ __forceinline__ bool
1429sectorizedContainsPackedKmer(uint64_t packedKmer, const uint64_t* w) {
1430 bool present = true;
1431 _Pragma("unroll")
1432 for (uint64_t smerOffset = 0; smerOffset < Config::findereSpan; ++smerOffset) {
1433 const uint64_t smerHash = packedKmerSmerHash<Config>(packedKmer, smerOffset);
1434 detail::forEachHashIndex<Config>(
1435 [&]<uint64_t HashIndex>(std::integral_constant<uint64_t, HashIndex>) {
1436 constexpr uint64_t s = HashIndex % Config::blockWordCount;
1437 const uint64_t bitPos =
1438 Filter<Config>::Shard::template sectorizedBitAddress<HashIndex>(smerHash);
1439 present &= ((w[s] >> bitPos) & 1) != 0;
1440 }
1441 );
1442 }
1443 return present;
1444}
1445
1446template <typename Config>
1447__device__ __forceinline__ bool kmerIsValid(const uint8_t* tile, uint64_t start) {
1448 _Pragma("unroll")
1449 for (uint64_t i = 0; i < Config::k; ++i) {
1450 if (tile[start + i] == Config::Alphabet::invalidSymbol) {
1451 return false;
1452 }
1453 }
1454 return true;
1455}
1456
1471template <typename Config>
1472__device__ __forceinline__ bool prepareSequenceHashTiles(
1473 const char* sequence,
1474 uint64_t blockStartKmer,
1475 uint64_t blockKmers,
1476 uint8_t* sequenceTile
1477) {
1478 const uint64_t tileBases = blockKmers + Config::k - 1;
1479
1480 bool localInvalidBase = false;
1481 for (uint64_t idx = threadIdx.x; idx < tileBases; idx += Config::cudaBlockSize) {
1482 const uint8_t encodedBase =
1483 Config::Alphabet::encode(sequence + (blockStartKmer + idx) * Config::symbolWidth);
1484 sequenceTile[idx] = encodedBase;
1485 localInvalidBase |= (encodedBase == Config::Alphabet::invalidSymbol);
1486 }
1487 return __syncthreads_count(localInvalidBase) == 0;
1488}
1489
1502template <typename Config>
1503__global__ __launch_bounds__(Config::cudaBlockSize, 6) void containsSequenceKmersKernel(
1505 device_span<const typename Filter<Config>::Shard> shards,
1506 device_span<uint8_t> output
1507) {
1508 // Each thread handles this many consecutive k-mers to amortise packing
1509 constexpr uint32_t kStride = kContainsSequenceStride;
1510 constexpr uint64_t sequenceTileBases = Config::cudaBlockSize * kStride + Config::k - 1;
1511
1512 __shared__ uint8_t sequenceTile[sequenceTileBases];
1513
1514 const uint64_t numKmers = input.kmerCount();
1515 const uint64_t blockStartKmer =
1516 static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize * kStride;
1517 if (blockStartKmer >= numKmers) {
1518 return;
1519 }
1520
1521 const uint64_t blockKmers = min(Config::cudaBlockSize * kStride, numKmers - blockStartKmer);
1522
1523 const bool blockAllValid = prepareSequenceHashTiles<Config>(
1524 input.sequence.data(), blockStartKmer, blockKmers, sequenceTile
1525 );
1526
1527 const uint64_t threadOffset = static_cast<uint64_t>(threadIdx.x) * kStride;
1528 if (threadOffset >= blockKmers) {
1529 return;
1530 }
1531
1532 // Bitmask: bit s set = k-mer at offset s is valid.
1533 uint32_t kmerValidMask = 0;
1534 _Pragma("unroll")
1535 for (uint32_t s = 0; s < kStride; ++s) {
1536 if ((threadOffset + s) < blockKmers) {
1537 kmerValidMask |= (1u << s);
1538 }
1539 }
1540
1541 if (!blockAllValid) {
1542 _Pragma("unroll")
1543 for (uint32_t s = 0; s < kStride; ++s) {
1544 if (!(kmerValidMask & (1u << s))) {
1545 continue;
1546 }
1547 const uint64_t localIdx = threadOffset + s;
1548 if (!kmerIsValid<Config>(sequenceTile, localIdx)) {
1549 kmerValidMask &= ~(1u << s);
1550 }
1551 }
1552 }
1553
1554 // Always pack from position 0. Sliding propagates the packed value forward
1555 // invalid bases from earlier k-mers are simply shifted out.
1556 uint64_t packedKmer = packKmerFromTile<Config, Config::k>(sequenceTile, threadOffset);
1557
1558 for (uint32_t s = 0; s < kStride; ++s) {
1559 const uint64_t localIdx = threadOffset + s;
1560 if (localIdx >= blockKmers) {
1561 break;
1562 }
1563
1564 const uint64_t kmerIndex = blockStartKmer + localIdx;
1565
1566 if (s > 0) {
1567 packedKmer = advancePackedKmer<Config, Config::k>(
1568 packedKmer, sequenceTile[localIdx + Config::k - 1]
1569 );
1570 }
1571
1572 if (!(kmerValidMask & (1u << s))) {
1573 output[kmerIndex] = 0;
1574 continue;
1575 }
1576
1577 const uint64_t minimizerHash = packedKmerMinimizerHash<Config>(packedKmer);
1578
1579 // Warp-level shard sharing.
1580 const auto shardIdx = static_cast<uint32_t>(minimizerHash & (shards.size() - 1));
1581 const uint32_t peers = __match_any_sync(0xFFFFFFFFu, shardIdx);
1582 const int leader = __ffs(static_cast<int>(peers)) - 1;
1583
1584 uint64_t w[4];
1585 if (static_cast<int>(threadIdx.x & 31u) == leader) {
1586 loadShardWords4<Config>(shards.data(), shardIdx, w);
1587 }
1588 w[0] = __shfl_sync(peers, w[0], leader);
1589 w[1] = __shfl_sync(peers, w[1], leader);
1590 w[2] = __shfl_sync(peers, w[2], leader);
1591 w[3] = __shfl_sync(peers, w[3], leader);
1592
1593 const bool present = sectorizedContainsPackedKmer<Config>(packedKmer, w);
1594 output[kmerIndex] = present;
1595 }
1596}
1597
1609template <typename Config>
1612 device_span<typename Filter<Config>::Shard> shards
1613) {
1614 constexpr uint64_t sequenceTileBases = Config::cudaBlockSize + Config::k - 1;
1615 constexpr uint32_t warpSize = 32;
1616 constexpr uint32_t warpsPerBlock = Config::cudaBlockSize / warpSize;
1617
1618 using WarpReduceWord = cub::WarpReduce<uint64_t>;
1619
1620 __shared__ uint8_t sequenceTile[sequenceTileBases];
1621 __shared__ typename WarpReduceWord::TempStorage reduceStorage[warpsPerBlock][4];
1622
1623 const uint64_t numKmers = input.kmerCount();
1624 const uint64_t blockStartKmer = static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize;
1625 if (blockStartKmer >= numKmers) {
1626 return;
1627 }
1628
1629 const uint64_t blockKmers = min(Config::cudaBlockSize, numKmers - blockStartKmer);
1630 const auto localKmerIndex = static_cast<uint64_t>(threadIdx.x);
1631 const bool inRange = localKmerIndex < blockKmers;
1632
1633 const bool blockAllValid = prepareSequenceHashTiles<Config>(
1634 input.sequence.data(), blockStartKmer, blockKmers, sequenceTile
1635 );
1636
1637 // Avoid early returns so all warp lanes can participate in the segmented
1638 // warp reductions below.
1639 bool active = inRange;
1640
1641 if (active && !blockAllValid) {
1642 active = kmerIsValid<Config>(sequenceTile, localKmerIndex);
1643 }
1644
1645 // Inactive threads keep zero masks and a per-lane sentinel shard index so
1646 // contiguous run detection naturally splits around them.
1647 uint64_t minimizerHash = 0;
1648 uint64_t wordMask0 = 0;
1649 uint64_t wordMask1 = 0;
1650 uint64_t wordMask2 = 0;
1651 uint64_t wordMask3 = 0;
1652
1653 if (active) {
1654 const uint64_t packedKmer =
1655 packKmerFromTile<Config, Config::k>(sequenceTile, localKmerIndex);
1656 minimizerHash = packedKmerMinimizerHash<Config>(packedKmer);
1657
1658 uint64_t h_s = packedKmerSmerHash<Config>(packedKmer, 0);
1660 h_s, wordMask0, wordMask1, wordMask2, wordMask3
1661 );
1662 _Pragma("unroll")
1663 for (uint64_t smerOffset = 1; smerOffset < Config::findereSpan; ++smerOffset) {
1664 h_s = packedKmerSmerHash<Config>(packedKmer, smerOffset);
1666 h_s, wordMask0, wordMask1, wordMask2, wordMask3
1667 );
1668 }
1669 }
1670
1671 // Warp-local segmented reductions: contiguous threads sharing the same
1672 // shard merge their masks so only the run head issues the atomicOrs.
1673 const auto shardIdx =
1674 static_cast<uint32_t>(active ? (minimizerHash & (shards.size() - 1)) : ~threadIdx.x);
1675
1676 const uint32_t lane = threadIdx.x & (warpSize - 1);
1677 const uint32_t warpIdx = threadIdx.x / warpSize;
1678 const uint32_t prevShardIdx = __shfl_up_sync(0xffffffff, shardIdx, 1);
1679 const bool runHead = (lane == 0) || (shardIdx != prevShardIdx);
1680 const BitwiseOr<uint64_t> bitwiseOr{};
1681
1682 wordMask0 = WarpReduceWord(reduceStorage[warpIdx][0])
1683 .HeadSegmentedReduce(wordMask0, runHead, bitwiseOr);
1684 wordMask1 = WarpReduceWord(reduceStorage[warpIdx][1])
1685 .HeadSegmentedReduce(wordMask1, runHead, bitwiseOr);
1686 wordMask2 = WarpReduceWord(reduceStorage[warpIdx][2])
1687 .HeadSegmentedReduce(wordMask2, runHead, bitwiseOr);
1688 wordMask3 = WarpReduceWord(reduceStorage[warpIdx][3])
1689 .HeadSegmentedReduce(wordMask3, runHead, bitwiseOr);
1690
1691 if (runHead && active) {
1692 auto& shard = shards[shardIdx];
1693 if (wordMask0 != 0) {
1694 atomicOrWord(&shard.words[0], wordMask0);
1695 }
1696 if (wordMask1 != 0) {
1697 atomicOrWord(&shard.words[1], wordMask1);
1698 }
1699 if (wordMask2 != 0) {
1700 atomicOrWord(&shard.words[2], wordMask2);
1701 }
1702 if (wordMask3 != 0) {
1703 atomicOrWord(&shard.words[3], wordMask3);
1704 }
1705 }
1706}
1707
1708} // namespace detail
1709
1710} // namespace cusbf
cuSBF GPU-accelerated sectorized Bloom filter.
FastxInsertReport insertFastxFile(std::string_view path, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{})
Inserts all k-mers from a FASTA/FASTQ file via chunked streaming.
FastxDetailedQueryReport queryFastxDetailed(std::istream &input, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries all k-mers from a FASTA/FASTQ input stream via chunked streaming and preserves per-record hit...
FastxDetailedQueryReport queryFastxFileDetailed(std::string_view path, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries all k-mers from a FASTA/FASTQ file via chunked streaming and preserves per-record hit vectors...
uint64_t filterBits() const
Returns the total allocated capacity of the filter in bits.
Filter(uint64_t requestedFilterBits)
Constructs a Filter with at least requestedFilterBits bits of storage.
float loadFactor() const
Computes the fraction of set bits in the filter.
void containsSequenceDevice(device_span< const char > d_sequence, device_span< uint8_t > d_output, cuda::stream_ref stream=cudaStream_t{}) const
Async query of k-mers from a device-resident sequence.
void clear(cuda::stream_ref stream=cudaStream_t{})
Resets all filter bits to zero and synchronises the stream.
Filter(const Filter &)=delete
Filter & operator=(const Filter &)=delete
FastxQueryReport queryFastxRecords(std::istream &input, Consumer &&consume, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries a FASTA/FASTQ stream and emits one record result per parsed record.
FastxInsertReport insertRecordBatch(RecordBatchView batch, cuda::stream_ref stream=cudaStream_t{})
Inserts a dense host-resident record batch.
std::vector< uint8_t > containsSequence(std::string_view sequence, cuda::stream_ref stream=cudaStream_t{}) const
Queries all valid k-mers from a host-resident sequence.
FastxQueryReport queryRecordBatch(RecordBatchView batch, Consumer &&consume, cuda::stream_ref stream=cudaStream_t{}) const
Queries a dense host-resident record batch and streams per-record results.
uint64_t insertSequenceDevice(device_span< const char > d_sequence, cuda::stream_ref stream=cudaStream_t{})
Async insert of k-mers from a device-resident sequence.
FastxQueryReport queryFastx(std::istream &input, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries all k-mers from a FASTA/FASTQ input stream via chunked streaming.
Filter(Filter &&)=default
uint64_t numShards() const
Returns the number of shards.
FastxQueryReport queryRecordBatch(RecordBatchView batch, cuda::stream_ref stream=cudaStream_t{}) const
Queries a dense host-resident record batch and returns aggregate counts.
uint64_t insertSequence(std::string_view sequence, cuda::stream_ref stream=cudaStream_t{})
Inserts all valid k-mers from a host-resident sequence.
FastxQueryReport queryFastxFile(std::string_view path, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries all k-mers from a FASTA/FASTQ file via chunked streaming.
Filter & operator=(Filter &&)=default
~Filter()=default
FastxInsertReport insertFastx(std::istream &input, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{})
Inserts all k-mers from a FASTA/FASTQ input stream.
FastxQueryReport queryFastxFileRecords(std::string_view path, Consumer &&consume, double fillFraction=0.7, cuda::stream_ref stream=cudaStream_t{}) const
Queries a FASTA/FASTQ file and emits one record result per parsed record.
Concept for alphabet-like types used to encode bytes as symbol indices.
Definition Alphabet.cuh:95
#define CUSBF_CUDA_CALL(err)
Macro for checking CUDA errors.
Definition helpers.cuh:132
__device__ __forceinline__ uint64_t packedKmerMinimizerHash(uint64_t packedKmer)
Computes the minimizer hash for a packed k-mer.
__device__ __forceinline__ bool kmerIsValid(const uint8_t *tile, uint64_t start)
__host__ __device__ __forceinline__ void forEachHashIndexImpl(Fn &&fn, std::index_sequence< HashIndices... >)
Implementation helper for forEachHashIndex (fold-expression over an index sequence).
__host__ __device__ __forceinline__ void forEachHashIndex(Fn &&fn)
Invokes fn for each hash index in [0, Config::hashCount) at compile time.
__device__ __forceinline__ bool sectorizedContainsPackedKmer(uint64_t packedKmer, const uint64_t *w)
Tests whether a packed k-mer is present in a pre-loaded shard.
__device__ __forceinline__ uint64_t packedKmerSmerHash(uint64_t packedKmer, uint64_t start)
Computes the hash for the s-mer at position start within a packed k-mer.
__device__ __forceinline__ bool prepareSequenceHashTiles(const char *sequence, uint64_t blockStartKmer, uint64_t blockKmers, uint8_t *sequenceTile)
Cooperatively loads and encodes a tile of symbols into shared memory.
__host__ __device__ __forceinline__ constexpr uint64_t extractPackedSubwindow(uint64_t packedKmer, uint64_t start)
Extracts a packed sub-window from a packed k-mer.
constexpr uint32_t kContainsSequenceStride
__global__ void containsSequenceKmersKernel(SequenceKmerInput< Config > input, device_span< const typename Filter< Config >::Shard > shards, device_span< uint8_t > output)
CUDA kernel: queries k-mers from a sequence against the filter.
__device__ __forceinline__ uint64_t advancePackedKmer(uint64_t packed, uint8_t newBase)
Slides the packed k-mer window forward by one symbol.
__device__ __forceinline__ void atomicOrWord(uint64_t *ptr, uint64_t value)
Atomically ORs value into the device word at ptr.
__device__ __forceinline__ void loadShardWords4(const typename Filter< Config >::Shard *shards, uint64_t shardIndex, uint64_t *w)
Loads all four 64-bit words of a shard into a local array.
__device__ __forceinline__ uint64_t packKmerFromTile(const uint8_t *tile, uint64_t start)
Packs K symbols from a shared-memory tile into an integer.
__global__ void insertSequenceKmersKernel(SequenceKmerInput< Config > input, device_span< typename Filter< Config >::Shard > shards)
CUDA kernel: inserts k-mers from a sequence into the filter.
constexpr uint64_t kInvalidHash
Sentinel hash value indicating "no valid minimizer found".
__host__ __device__ __forceinline__ constexpr uint64_t multiplicativeSaltLiteral()
Returns the multiplicative salt constant for hash function Index.
consteval bool separatorPositionAlwaysEncodesInvalid(char *input, uint64_t separatorPosition, uint64_t index)
Recursively tests whether placing the separator byte at any position in an input of valid bytes alway...
Definition Alphabet.cuh:37
__host__ __device__ __forceinline__ constexpr uint64_t packedWindowMask()
Returns a bitmask covering Length packed alphabet symbols.
Compile-time configuration for a cusbf::Filter.
static constexpr uint64_t symbolWidth
static constexpr uint64_t blockWordCount
static constexpr uint64_t alphabetSize
static constexpr uint16_t s
static constexpr uint64_t hashCount
static constexpr uint16_t m
static constexpr uint64_t symbolMask
static constexpr uint64_t minimizerSpan
static constexpr uint64_t cudaBlockSize
static constexpr uint64_t findereSpan
static constexpr uint64_t maxRunKmers
static constexpr uint64_t wordBits
static constexpr uint16_t k
static constexpr uint64_t insertGroupSize
static constexpr uint64_t filterBlockBits
static constexpr uint64_t symbolBits
static constexpr uint64_t queryGroupSize
One 256-bit filter block stored as an array of Config::blockWordCount words.
static constexpr int hashShift
uint64_t words[wordCount]
static constexpr uint64_t sliceWidth
static constexpr int wordBitsLog2
static constexpr bool useBitSlicing
__device__ static __forceinline__ void sectorizedHashToMasks(uint64_t baseHash, uint64_t &mask0, uint64_t &mask1, uint64_t &mask2, uint64_t &mask3)
Computes four word bitmasks from a single base hash.
static constexpr uint64_t wordBits
static constexpr uint64_t wordMask
constexpr __host__ static __device__ uint64_t sectorizedBitAddress(uint64_t baseHash)
Maps a base hash to a bit position within word sector HashIndex.
static constexpr uint64_t wordCount
__host__ __device__ __forceinline__ T operator()(T lhs, T rhs) const
Compile-time golden-ratio-derived multiplicative salt constants.
Kernel input descriptor for a sequence k-mer sweep.
constexpr __host__ __device__ uint64_t kmerCount() const
constexpr __host__ __device__ uint64_t smerCount() const
device_span< const char > sequence
A span that is assumed to point to device-accessible memory.