Namespaces
namespace	xxhash
	XXHash_64 implementation.

Classes
struct	BitwiseOr
	Device functor for bitwise OR reduction (CUB `WarpReduce`). More...

class	ChunkStreamPair
	Two non-blocking CUDA streams for overlapping chunk H2D and kernel work. More...

struct	cuda_free_memory
	Driver-reported free device memory (bytes available for new allocations). More...

struct	DensePackedKmerInput
	Device-side view of a dense packed symbol sequence. More...

class	FastxBufferReader
	FASTA/FASTQ parser over a contiguous in-memory buffer. More...

class	FastxFileBuffer
	Read-only contiguous file payload for in-memory FASTX parsing. More...

class	FastxPinnedSequenceBuffer
	Page-locked host buffer for fused FASTX normalize + H2D staging. More...

class	FastxPipelineReleaseGuard

class	FastxPipelineState

class	FastxReader
	Streaming FASTA/FASTQ parser. More...

struct	FastxRecord
	A single sequence record extracted from a FASTA/FASTQ stream. More...

class	GzIstream
	`std::istream` adapter for GzStreambuf. More...

class	GzStreambuf
	`std::streambuf` over a gzip file (zlib `gzread`). More...

class	QueryLayout

struct	QueryLayoutRecord

struct	SaltLiteral

struct	SaltLiteral< 0 >

struct	SaltLiteral< 1 >

struct	SaltLiteral< 10 >

struct	SaltLiteral< 11 >

struct	SaltLiteral< 12 >

struct	SaltLiteral< 13 >

struct	SaltLiteral< 14 >

struct	SaltLiteral< 15 >

struct	SaltLiteral< 2 >

struct	SaltLiteral< 3 >

struct	SaltLiteral< 4 >

struct	SaltLiteral< 5 >

struct	SaltLiteral< 6 >

struct	SaltLiteral< 7 >

struct	SaltLiteral< 8 >

struct	SaltLiteral< 9 >

struct	SequenceKmerInput
	Device-side view of an encoded sequence for k-mer / s-mer counting. More...

Concepts
concept	cusbf_result_like

concept	fastx_dispatch_handler
	Handler invoked by dispatch_fastx_file with either reader type and a dispatch path.

Typedefs
template<typename Handler >
using	fastx_dispatch_handler_result_t = std::invoke_result_t< Handler &, FastxReader &, fastx_dispatch_path >
	Return type of a fastx_dispatch_handler when invoked with a stream reader.

Enumerations
enum class	fastx_chunk_mode { insert , query }
	FASTX mode used to size GPU staging buffers. More...

enum class	fastx_dispatch_path { single_chunk_stream , single_chunk_mmap , chunked_mmap , chunked_stream }
	How a FASTX file is read and chunked for GPU processing. More...

enum class	FastxFormat : uint8_t { unknown , fasta , fastq }
	Detected file format for a FASTA/FASTQ stream. More...

Functions
template<typename T >
consteval uint64_t	validByteCount ()

template<typename T >
consteval bool	separatorPositionAlwaysEncodesInvalid (char *input, uint64_t separatorPosition, uint64_t index)
	Recursively tests whether placing the separator byte at any position in an input of valid bytes always results in an invalid encoding.

template<typename T >
consteval bool	separatorByteAlwaysEncodesInvalid ()
	Tests that for every position in the input, placing the separator byte at that position always results in an invalid encoding.

template<typename Config >
__global__ void	count_positive_kmers_per_record_kernel (const uint8_t *hits, const QueryLayoutRecord *records, uint64_t *positive_kmers_out, uint64_t record_count)
	Per-record kernel: sums the `hit_count` entries of `hits` starting at `hit_offset` for each QueryLayoutRecord.

template<typename Config >
uint64_t	count_positive_kmers_total (device_span< const uint8_t > hits, cuda::stream_ref stream)
	Device-wide count of positive entries (bytes equal to 1) in a per-k-mer hit buffer.

template<typename Config >
Result< void >	count_positive_kmers_per_record (device_span< const uint8_t > hits, device_span< const QueryLayoutRecord > records, device_span< uint64_t > positive_kmers_out, cuda::stream_ref stream)
	Fills `positive_kmers_out` with per-record positive k-mer counts.

template<typename Config >
constexpr uint64_t	dense_packed_symbols_per_word ()
	Encoded symbols stored in each `uint64_t` word for `Config`.

template<typename Config >
constexpr uint64_t	dense_packed_word_count (uint64_t num_symbols)
	Returns the number of `uint64_t` words required for `num_symbols` encoded symbols.

template<typename Config >
constexpr uint64_t	dense_packed_kmer_count (uint64_t num_symbols)
	Returns the number of k-mer windows in a dense packed symbol sequence.

template<typename Config >
__device__ __forceinline__ uint8_t	dense_packed_symbol_at (const uint64_t *words, uint64_t symbol_index)
	Decodes one packed symbol at global index `symbol_index`.

template<typename Config >
__device__ __forceinline__ uint8_t	dense_packed_symbol_at_local (const uint64_t *word_tile, uint64_t first_word_index, uint64_t symbol_index)
	Decodes one packed symbol from a block-local word tile.

template<typename Config >
__device__ __forceinline__ bool	prepare_dense_packed_tiles (const uint64_t *words, uint64_t block_start_kmer, uint64_t block_kmers, uint64_t *word_tile, uint8_t *sequence_tile)
	Fills `sequence_tile` with encoded symbols for k-mers starting at `block_start_kmer`.

template<typename Config >
__global__ void	pack_dense_sequence_kernel (const char *sequence, uint64_t num_symbols, uint64_t *words)
	Packs an encoded byte sequence into dense `symbolBits`-wide symbols stored in `uint64_t` words.

Result< cuda_free_memory >	query_cuda_free_memory ()
	Queries current device free memory via `cudaMemGetInfo`.

constexpr size_t	fastx_chunk_slack_bytes () noexcept
	Reserved device memory left for allocator and kernel temporaries.

uint64_t	fastx_file_bytes (const std::filesystem::path &path)

template<typename Config >
constexpr uint64_t	fastx_record_symbol_count (uint64_t bases) noexcept

template<typename Config >
constexpr uint64_t	fastx_record_kmer_count (uint64_t bases) noexcept

template<typename Config >
constexpr uint64_t	estimate_normalized_sequence_bytes (uint64_t raw_bytes, uint64_t record_count) noexcept
	Upper bound on normalized sequence bytes for a raw host chunk.

template<typename Config >
constexpr uint64_t	estimate_insert_staging_bytes (uint64_t raw_bytes, uint64_t record_count) noexcept
	Peak device bytes for insert staging (`d_sequence_`) for a host chunk.

template<typename Config >
constexpr uint64_t	estimate_query_staging_bytes (uint64_t raw_bytes, uint64_t record_count) noexcept
	Peak device bytes for query staging (`d_sequence_` + `d_resultBuffer_`).

template<typename Config >
constexpr size_t	fastx_staging_budget_bytes (double fill_fraction, size_t free_bytes) noexcept
	Device staging byte budget derived from free VRAM and `fill_fraction`.

template<typename Config >
constexpr bool	fastx_chunk_reached_staging_budget (fastx_chunk_mode mode, size_t staging_budget_bytes, uint64_t raw_bytes, uint64_t record_count) noexcept
	Whether estimated device staging meets or exceeds `staging_budget_bytes`.

constexpr size_t	fastx_pipelined_chunk_budget (fastx_chunk_mode mode, size_t staging_budget_bytes) noexcept
	Per-chunk flush budget for dual-stream ping-pong (two device sequence buffers).

template<typename Config >
bool	fastx_chunk_should_flush (fastx_chunk_mode mode, size_t gpu_staging_budget_bytes, size_t host_chunk_max_bytes, uint64_t raw_chunk_bytes, uint64_t record_count) noexcept
	Whether a host chunk should flush based on GPU staging and host byte limits.

template<typename Config >
bool	fastx_file_fits_gpu_staging (const std::filesystem::path &path, fastx_chunk_mode mode, double fill_fraction)
	Whether the entire uncompressed file fits in one GPU staging pass.

template<typename Config >
Result< void >	validate_fastx_staging_fits (fastx_chunk_mode mode, double fill_fraction, uint64_t raw_bytes, uint64_t record_count, std::string_view source_name)

template<typename FastxReaderType >
Result< bool >	collect_next_fastx_record (FastxReaderType &reader, FastxRecord &record, DenseRecordBatchBuilder &chunk)
	Appends the next FASTX record from `reader` into `chunk`.

constexpr bool	fastx_is_single_chunk_path (fastx_dispatch_path path) noexcept
	True for fastx_dispatch_path::single_chunk_stream or fastx_dispatch_path::single_chunk_mmap.

constexpr bool	fastx_uses_mmap_reader (fastx_dispatch_path path) noexcept
	True when dispatch uses FastxBufferReader over an mmap'd file.

template<typename Config >
bool	fastx_fits_single_gpu_chunk (fastx_chunk_mode mode, double fill_fraction, uint64_t file_bytes)
	Whether the entire file fits in a single GPU staging chunk at `fill_fraction`.

uint64_t	fastx_single_chunk_stream_max_bytes ()
	Max raw file size for fastx_dispatch_path::single_chunk_stream (istream, no mmap).

template<typename Config >
fastx_dispatch_path	select_fastx_dispatch_path_for_file_bytes (uint64_t file_bytes, fastx_chunk_mode mode, double fill_fraction, bool file_fits_in_memory)
	Selects mmap vs stream and single- vs multi-chunk processing from file size.

template<typename Config >
fastx_dispatch_path	select_fastx_dispatch_path (const std::filesystem::path &path, fastx_chunk_mode mode, double fill_fraction)
	Selects mmap vs stream and single- vs multi-chunk processing for a path.

template<typename Config , fastx_dispatch_handler Handler>
fastx_dispatch_handler_result_t< Handler >	dispatch_fastx_file (const std::filesystem::path &path, fastx_chunk_mode mode, double fill_fraction, Handler &&handler)
	Opens a FASTX path and invokes `handler` with a reader and dispatch path.

bool	fastx_file_supports_memory_map (const std::filesystem::path &path)
	True when `path` is not gzip-compressed (mmap path is usable).

size_t	parse_host_chunk_max_bytes (const char *env_name)

size_t	fastx_host_chunk_max_bytes ()
	Optional host assembly byte cap before flush (debug / low-RAM safety valve).

constexpr bool	fastx_chunk_reached_host_byte_limit (size_t host_chunk_max_bytes, uint64_t raw_chunk_bytes) noexcept
	True when `raw_chunk_bytes` reaches the optional host assembly cap.

size_t	fastx_host_ram_slack_bytes ()

size_t	query_available_host_bytes ()
	Available physical RAM (bytes) for mmap budgeting.

uint64_t	fastx_memory_map_max_bytes ()
	Upper bound on file bytes that may be mmap'd (env cap and available RAM minus slack).

bool	fastx_file_fits_in_memory (const std::filesystem::path &path)
	True when uncompressed `path` size is within fastx_memory_map_max_bytes.

template<typename Config , typename FastxReaderType , typename Adapter >
Result< typename Adapter::report_type >	run_fastx_pipeline (FastxReaderType &reader, std::string_view source_name, double fill_fraction, cuda::stream_ref stream, fastx_dispatch_path dispatch_path, FastxPipelineState &state, Adapter &&adapter)

template<uint64_t Index>
__host__ __device__ __forceinline__ constexpr uint64_t	multiplicativeSaltLiteral ()
	Compile-time multiplicative salt for Bloom hash index `Index`.

template<typename Config , typename Fn , uint64_t... HashIndices>
__host__ __device__ __forceinline__ void	forEachHashIndexImpl (Fn &&fn, std::index_sequence< HashIndices... >)
	Unrolled invocation of `fn` for each Bloom hash index in `Config`.

template<typename Config , typename Fn >
__host__ __device__ __forceinline__ void	forEachHashIndex (Fn &&fn)
	Invokes `fn` once per Bloom hash index (compile-time unrolled).

template<typename Config , uint64_t Length>
__host__ __device__ __forceinline__ constexpr uint64_t	packedWindowMask ()
	Bit mask retaining the low `Length` symbols of a packed k-mer.

template<typename Config , uint64_t WindowLength, uint64_t K>
__host__ __device__ __forceinline__ constexpr uint64_t	extractPackedSubwindow (uint64_t packed_kmer, uint64_t start)
	Extracts an `m-mer` or `s-mer` subwindow from a packed k-mer at `start`.

__device__ __forceinline__ void	atomicOrWord (uint64_t *ptr, uint64_t value)
	64-bit atomic OR used for sectorized Bloom inserts.

uint64_t	parse_env_mebibytes (std::string_view value)
	Parses a decimal mebibyte count from `value`.

std::string_view	getenv_value (const char *env_name)
	Reads `env_name` via `getenv`, or an empty view when unset.

template<typename Config >
constexpr uint64_t	dense_packed_insert_word_tile_capacity ()
	Maximum `uint64_t` words loaded for a dense-packed insert block tile.

template<typename Config >
constexpr uint64_t	dense_packed_query_word_tile_capacity ()
	Maximum `uint64_t` words loaded for a dense-packed query block tile.

template<typename Config , uint32_t k_stride>
__device__ __forceinline__ void	contains_kmers_from_symbol_tile (const uint8_t *sequence_tile, uint64_t block_start_kmer, uint64_t block_kmers, bool block_all_valid, device_span< const filter_block< Config > > shards, device_span< uint8_t > output)
	Shared query path after a block symbol tile has been prepared.

template<typename Config , uint32_t warps_per_block>
__device__ __forceinline__ void	insert_kmers_from_symbol_tile (const uint8_t *sequence_tile, uint64_t block_start_kmer, uint64_t block_kmers, bool block_all_valid, device_span< filter_block< Config > > shards, cub::WarpReduce< uint64_t >::TempStorage reduce_storage[warps_per_block][4])
	Shared insert path after a block symbol tile has been prepared.

template<typename Config >
__global__ void	contains_sequence_kmers_kernel (SequenceKmerInput< Config > input, device_span< const filter_block< Config > > shards, device_span< uint8_t > output)
	Query kernel: one byte per k-mer (1 = present, 0 = absent or invalid).

template<typename Config >
__global__ void	insert_sequence_kmers_kernel (SequenceKmerInput< Config > input, device_span< filter_block< Config > > shards)
	Insert kernel: sectorized Bloom updates grouped by minimizer shard.

template<typename Config >
__global__ void	contains_dense_packed_kmers_kernel (DensePackedKmerInput< Config > input, device_span< const filter_block< Config > > shards, device_span< uint8_t > output)
	Query kernel for a dense packed symbol buffer (DensePackedKmerInput).

template<typename Config >
__global__ void	insert_dense_packed_kmers_kernel (DensePackedKmerInput< Config > input, device_span< filter_block< Config > > shards)
	Insert kernel for a dense packed symbol buffer (DensePackedKmerInput).

template<typename Config >
constexpr __host__ __device__ uint64_t	record_symbol_count (uint64_t bases) noexcept

template<typename Config >
constexpr __host__ __device__ uint64_t	record_kmer_count (uint64_t bases) noexcept

template<typename Config >
__device__ __forceinline__ uint64_t	packed_kmer_minimizer_hash (uint64_t packed_kmer)
	Minimum minimizer hash over all `m-mers` in a packed k-mer.

template<typename Config >
__device__ __forceinline__ uint64_t	packed_kmer_smer_hash (uint64_t packed_kmer, uint64_t start)
	Bloom hash for the s-mer at `start` within a packed k-mer.

template<typename Config >
__device__ __forceinline__ void	load_shard_words4 (const filter_block< Config > *shards, uint64_t shard_index, uint64_t *w)
	Loads four 64-bit shard words with 256-bit (sm_100+) or 128-bit vector loads.

template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t	pack_kmer_from_tile (const uint8_t *tile, uint64_t start)
	Packs `K` encoded symbols from a shared-memory tile starting at `start`.

template<typename Config , uint64_t K>
__device__ __forceinline__ uint64_t	advance_packed_kmer (uint64_t packed, uint8_t new_base)
	Slides a packed k-mer window by one encoded base.

template<typename Config >
__device__ __forceinline__ bool	kmer_is_valid (const uint8_t *tile, uint64_t start)
	True when no symbol in the k-mer window is the alphabet invalid sentinel.

template<typename Config >
__device__ __forceinline__ bool	prepare_sequence_hash_tiles (const char *sequence, uint64_t block_start_kmer, uint64_t block_kmers, uint8_t *sequence_tile)
	Encodes a block's sequence slice into `sequence_tile` and reports global validity.

template<uint32_t k_stride, typename Config >
__device__ __forceinline__ uint32_t	build_stride_kmer_valid_mask (uint64_t thread_offset, uint64_t block_kmers, bool block_all_valid, const uint8_t *sequence_tile)
	Builds the per-thread validity bitmask for strided query kernels.

template<typename T >
T	try_unwrap_success (Result< T > &result)

void	try_unwrap_success (Result< void > &result)

cuda::std::unexpected< Error >	propagate_error (const Error &error)
	Copies `error` for propagation (avoids moving out of `expected::error()`).

void	trimTrailingCarriageReturn (std::string &line)
	Removes a trailing carriage return from `line` if present (Windows line endings).

uint32_t	fastx_column_at (std::string_view line, size_t byte_index)
	1-based column at `byte_index` within `line` (clamped to the line end).

uint32_t	fastx_quality_excess_column (uint64_t quality_length, uint64_t expected_length, std::string_view line)
	1-based column of the first quality byte that exceeds `expected_length`.

uint32_t	fastx_quality_short_column (std::string_view line)
	1-based column where a quality run ends too short (position after the last byte).

Result< std::unique_ptr< std::istream > >	openFastxFile (const std::filesystem::path &path)
	Opens a FASTA/FASTQ file for reading.

bool	isGzipFile (const std::filesystem::path &path)
	True when `path` begins with the gzip magic bytes (`0x1F`, `0x8B`).

constexpr __host__ __device__ __forceinline__ uint64_t	hash64 (uint64_t key)
	Fast 64-bit integer hash (non-cryptographic).

constexpr __host__ __device__ __forceinline__ uint64_t	minimizer_hash64 (uint64_t key)
	Fast 64-bit hash sufficient for uniform minimizer selection.

template<typename T >
__device__ __forceinline__ void	load256BitGlobalNC (const T *ptr, T *out)
	Loads 256 bits from global memory using the non-coherent cache path.

__device__ __forceinline__ void	load256BitGlobalNC (const uint64_t *ptr, uint64_t &out0, uint64_t &out1, uint64_t &out2, uint64_t &out3)
	Loads four `uint64_t` words via `ld.global.nc.v4.u64` (sm_100+ only).

__device__ __forceinline__ void	load128BitGlobalNC (const uint64_t *ptr, uint64_t &out0, uint64_t &out1)
	Loads 128 bits from global memory using the non-coherent cache path.

__device__ __forceinline__ uint64_t	warpReduceOr (uint32_t peers, uint64_t value)
	OR-reduce a uint64_t across the lanes in a peer mask.

template<typename Kernel >
uint64_t	maxOccupancyGridSize (int32_t blockSize, Kernel kernel, uint64_t dynamicSMemSize)
	Calculates the maximum occupancy grid size for a kernel.

Variables
constexpr size_t	kDefaultFastxHostRamSlackBytes = 4u << 30
	Default headroom left for the OS and other processes when sizing mmap.

constexpr uint32_t	kContainsSequenceStride = 4
	K-mers processed per query thread per inner loop iteration.

constexpr uint64_t	kInvalidHash = std::numeric_limits<uint64_t>::max()
	Sentinel hash value indicating "no valid minimizer found".

Typedef Documentation

◆ fastx_dispatch_handler_result_t

template<typename Handler >

using cusbf::detail::fastx_dispatch_handler_result_t = std::invoke_result_t<Handler&, FastxReader&, fastx_dispatch_path>

Return type of a fastx_dispatch_handler when invoked with a stream reader.

Definition at line 136 of file fastx_dispatch.hpp.

Enumeration Type Documentation

◆ fastx_chunk_mode

enum class cusbf::detail::fastx_chunk_mode

strong

FASTX mode used to size GPU staging buffers.

Enumerator
insert
query

Definition at line 24 of file fastx_chunk.cuh.

{ insert, query };

cusbf::detail::fastx_chunk_mode::query

@ query

cusbf::detail::fastx_chunk_mode::insert

@ insert

◆ fastx_dispatch_path

enum class cusbf::detail::fastx_dispatch_path

strong

How a FASTX file is read and chunked for GPU processing.

Enumerator
single_chunk_stream	Whole file in one GPU chunk, stream via `istream` (no mmap).
single_chunk_mmap	Whole file in one GPU chunk, mmap'd when it fits in host RAM.
chunked_mmap	Multiple GPU chunks, file mmap'd when it fits in host RAM.
chunked_stream	Multiple GPU chunks, stream via `istream` (gzip or larger than RAM).

Definition at line 29 of file fastx_dispatch.hpp.

                               {
    single_chunk_stream,
    single_chunk_mmap,
    chunked_mmap,
    chunked_stream,
};

◆ FastxFormat

enum class cusbf::detail::FastxFormat : uint8_t

strong

Detected file format for a FASTA/FASTQ stream.

Enumerator
unknown	Format not yet determined from the first header.
fasta	FASTA (`>` headers).
fastq	FASTQ (`@` headers).

Definition at line 243 of file Fastx.hpp.

                       : uint8_t {
    unknown,
    fasta,
    fastq,
};

Function Documentation

◆ advance_packed_kmer()

template<typename Config , uint64_t K>

__device__ __forceinline__ uint64_t cusbf::detail::advance_packed_kmer	(	uint64_t	packed,
		uint8_t	new_base
	)

Slides a packed k-mer window by one encoded base.

Definition at line 86 of file sequence_kmer.cuh.

                                                                                           {
    return ((packed << Config::symbolBits) | (new_base & Config::symbolMask)) &
           packedWindowMask<Config, K>();
}

Here is the call graph for this function:

◆ atomicOrWord()

__device__ __forceinline__ void cusbf::detail::atomicOrWord	(	uint64_t *	ptr,
		uint64_t	value
	)

64-bit atomic OR used for sectorized Bloom inserts.

Definition at line 137 of file filter_common.cuh.

                                                                            {
    atomicOr(reinterpret_cast<unsigned long long*>(ptr), static_cast<unsigned long long>(value));
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_stride_kmer_valid_mask()

template<uint32_t k_stride, typename Config >

__device__ __forceinline__ uint32_t cusbf::detail::build_stride_kmer_valid_mask	(	uint64_t	thread_offset,
		uint64_t	block_kmers,
		bool	block_all_valid,
		const uint8_t *	sequence_tile
	)

Builds the per-thread validity bitmask for strided query kernels.

Definition at line 129 of file sequence_kmer.cuh.

  {
    uint32_t kmer_valid_mask = 0;
    _Pragma("unroll")
    for (uint32_t s = 0; s < k_stride; ++s) {
        if ((thread_offset + s) < block_kmers) {
            kmer_valid_mask |= (1u << s);
        }
    }
 
    if (!block_all_valid) {
        _Pragma("unroll")
        for (uint32_t s = 0; s < k_stride; ++s) {
            if (!(kmer_valid_mask & (1u << s))) {
                continue;
            }
            const uint64_t local_idx = thread_offset + s;
            if (!kmer_is_valid<Config>(sequence_tile, local_idx)) {
                kmer_valid_mask &= ~(1u << s);
            }
        }
    }
    return kmer_valid_mask;
}

Here is the call graph for this function:

◆ collect_next_fastx_record()

template<typename FastxReaderType >

Result< bool > cusbf::detail::collect_next_fastx_record	(	FastxReaderType &	reader,
		FastxRecord &	record,
		DenseRecordBatchBuilder &	chunk
	)

inline

Appends the next FASTX record from reader into chunk.

Definition at line 13 of file fastx_dense_batch.hpp.

  {
    if constexpr (std::is_same_v<std::decay_t<FastxReaderType>, FastxBufferReader>) {
        const auto range = CUSBF_TRY(
            reader.appendNextRecord(record, chunk.sequence_buffer(), chunk.external_sequence_slot())
        );
        if (!range) {
            return false;
        }
        chunk.push_range(*range);
        return true;
    }
 
    const bool has_record = CUSBF_TRY(reader.nextRecord(record));
    if (!has_record) {
        return false;
    }
    chunk.appendRecord(record.sequence);
    return true;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ contains_dense_packed_kmers_kernel()

template<typename Config >

__global__ void cusbf::detail::contains_dense_packed_kmers_kernel	(	DensePackedKmerInput< Config >	input,
		device_span< const filter_block< Config > >	shards,
		device_span< uint8_t >	output
	)

Query kernel for a dense packed symbol buffer (DensePackedKmerInput).

Definition at line 248 of file kernels.cuh.

  {
    constexpr uint32_t k_stride = kContainsSequenceStride;
    constexpr uint64_t sequence_tile_bases = Config::cudaBlockSize * k_stride + Config::k - 1;
    constexpr uint64_t word_tile_capacity = dense_packed_query_word_tile_capacity<Config>();
 
    __shared__ uint64_t word_tile[word_tile_capacity];
    __shared__ uint8_t sequence_tile[sequence_tile_bases];
 
    const uint64_t num_kmers = input.kmerCount();
    const uint64_t block_start_kmer =
        static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize * k_stride;
    if (block_start_kmer >= num_kmers) {
        return;
    }
 
    const uint64_t block_kmers =
        min(Config::cudaBlockSize * k_stride, num_kmers - block_start_kmer);
 
    const bool block_all_valid = prepare_dense_packed_tiles<Config>(
        input.words.data(), block_start_kmer, block_kmers, word_tile, sequence_tile
    );
 
    contains_kmers_from_symbol_tile<Config, k_stride>(
        sequence_tile, block_start_kmer, block_kmers, block_all_valid, shards, output
    );
}

Here is the call graph for this function:

◆ contains_kmers_from_symbol_tile()

template<typename Config , uint32_t k_stride>

__device__ __forceinline__ void cusbf::detail::contains_kmers_from_symbol_tile	(	const uint8_t *	sequence_tile,
		uint64_t	block_start_kmer,
		uint64_t	block_kmers,
		bool	block_all_valid,
		device_span< const filter_block< Config > >	shards,
		device_span< uint8_t >	output
	)

Shared query path after a block symbol tile has been prepared.

Used by both byte-sequence and dense-packed symbol kernels.

Definition at line 41 of file kernels.cuh.

  {
    const uint64_t thread_offset = static_cast<uint64_t>(threadIdx.x) * k_stride;
    if (thread_offset >= block_kmers) {
        return;
    }
 
    const uint32_t kmer_valid_mask = build_stride_kmer_valid_mask<k_stride, Config>(
        thread_offset, block_kmers, block_all_valid, sequence_tile
    );
 
    uint64_t packed_kmer = pack_kmer_from_tile<Config, Config::k>(sequence_tile, thread_offset);
    filter_ref<Config> ref;
 
    for (uint32_t s = 0; s < k_stride; ++s) {
        const uint64_t local_idx = thread_offset + s;
        if (local_idx >= block_kmers) {
            break;
        }
 
        const uint64_t kmer_index = block_start_kmer + local_idx;
 
        if (s > 0) {
            packed_kmer = advance_packed_kmer<Config, Config::k>(
                packed_kmer, sequence_tile[local_idx + Config::k - 1]
            );
        }
 
        if (!(kmer_valid_mask & (1u << s))) {
            output[kmer_index] = 0;
            continue;
        }
 
        const uint64_t minimizer_hash = packed_kmer_minimizer_hash<Config>(packed_kmer);
 
        const auto shard_idx =
            static_cast<uint32_t>(filter_ref<Config>::shard_index(minimizer_hash, shards.size()));
        const uint32_t peers = __match_any_sync(0xFFFFFFFFu, shard_idx);
        const int leader = __ffs(static_cast<int>(peers)) - 1;
 
        uint64_t w[4];
        if (static_cast<int>(threadIdx.x & 31u) == leader) {
            load_shard_words4<Config>(shards.data(), shard_idx, w);
        }
        w[0] = __shfl_sync(peers, w[0], leader);
        w[1] = __shfl_sync(peers, w[1], leader);
        w[2] = __shfl_sync(peers, w[2], leader);
        w[3] = __shfl_sync(peers, w[3], leader);
 
        const bool present = ref.sectorized_contains_packed_kmer(packed_kmer, w);
        output[kmer_index] = present;
    }
}

Here is the call graph for this function:

◆ contains_sequence_kmers_kernel()

template<typename Config >

__global__ void cusbf::detail::contains_sequence_kmers_kernel	(	SequenceKmerInput< Config >	input,
		device_span< const filter_block< Config > >	shards,
		device_span< uint8_t >	output
	)

Query kernel: one byte per k-mer (1 = present, 0 = absent or invalid).

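Each thread processes up to kContainsSequenceStride consecutive k-mers; lanes of a warp that map to the same shard elect one leader to load the shard words, which are then broadcast to the other lanes.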

Definition at line 180 of file kernels.cuh.

  {
    constexpr uint32_t k_stride = kContainsSequenceStride;
    constexpr uint64_t sequence_tile_bases = Config::cudaBlockSize * k_stride + Config::k - 1;
 
    __shared__ uint8_t sequence_tile[sequence_tile_bases];
 
    const uint64_t num_kmers = input.kmerCount();
    const uint64_t block_start_kmer =
        static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize * k_stride;
    if (block_start_kmer >= num_kmers) {
        return;
    }
 
    const uint64_t block_kmers =
        min(Config::cudaBlockSize * k_stride, num_kmers - block_start_kmer);
 
    const bool block_all_valid = prepare_sequence_hash_tiles<Config>(
        input.sequence.data(), block_start_kmer, block_kmers, sequence_tile
    );
 
    contains_kmers_from_symbol_tile<Config, k_stride>(
        sequence_tile, block_start_kmer, block_kmers, block_all_valid, shards, output
    );
}

Here is the call graph for this function:

◆ count_positive_kmers_per_record()

template<typename Config >

Result< void > cusbf::detail::count_positive_kmers_per_record	(	device_span< const uint8_t >	hits,
		device_span< const QueryLayoutRecord >	records,
		device_span< uint64_t >	positive_kmers_out,
		cuda::stream_ref	stream
	)

inline

Fills positive_kmers_out with per-record positive k-mer counts.

positive_kmers_out must hold at least records.size() elements.

Definition at line 70 of file count_positive_kmers.cuh.

  {
    if (records.empty()) {
        return {};
    }
    if (positive_kmers_out.size() < records.size()) {
        return Err(Error::invalid_argument("positive k-mer output buffer is too small"));
    }
 
    const uint32_t block_size = 256;
    const uint32_t grid_size = cuda::ceil_div(records.size(), static_cast<uint64_t>(block_size));
    count_positive_kmers_per_record_kernel<Config><<<grid_size, block_size, 0, stream.get()>>>(
        hits.data(), records.data(), positive_kmers_out.data(), records.size()
    );
    CUSBF_CUDA_TRY(cudaGetLastError());
    return {};
}

Here is the call graph for this function:

◆ count_positive_kmers_per_record_kernel()

template<typename Config >

__global__ void cusbf::detail::count_positive_kmers_per_record_kernel	(	const uint8_t *	hits,
		const QueryLayoutRecord *	records,
		uint64_t *	positive_kmers_out,
		uint64_t	record_count
	)

Per-record kernel: sums hits[hit_offset ..] for each QueryLayoutRecord.

Definition at line 22 of file count_positive_kmers.cuh.

  {
    const uint64_t record_index = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (record_index >= record_count) {
        return;
    }
 
    const QueryLayoutRecord& record = records[record_index];
    if (record.hit_count == 0) {
        positive_kmers_out[record_index] = 0;
        return;
    }
 
    uint64_t positive = 0;
    const uint64_t begin = record.hit_offset;
    for (uint64_t i = 0; i < record.hit_count; ++i) {
        positive += hits[begin + i];
    }
    positive_kmers_out[record_index] = positive;
}

Here is the call graph for this function:

◆ count_positive_kmers_total()

template<typename Config >

uint64_t cusbf::detail::count_positive_kmers_total	(	device_span< const uint8_t >	hits,
		cuda::stream_ref	stream
	)

inline

Device-wide count of positive entries (bytes equal to 1) in a per-k-mer hit buffer.

Definition at line 50 of file count_positive_kmers.cuh.

                                                                                   {
    if (hits.empty()) {
        return 0;
    }
 
    const auto execution = thrust::cuda::par.on(stream.get());
    return static_cast<uint64_t>(thrust::count(
        execution,
        thrust::device_pointer_cast(hits.data()),
        thrust::device_pointer_cast(hits.data()) + hits.size(),
        uint8_t{1}
    ));
}

Here is the call graph for this function:

◆ dense_packed_insert_word_tile_capacity()

template<typename Config >

constexpr uint64_t cusbf::detail::dense_packed_insert_word_tile_capacity ( )

constexpr

Maximum uint64_t words loaded for a dense-packed insert block tile.

Definition at line 22 of file kernels.cuh.

                                                                          {
    constexpr uint64_t tile_symbols = Config::cudaBlockSize + Config::k - 1;
    return cuda::ceil_div(tile_symbols, dense_packed_symbols_per_word<Config>());
}

Here is the call graph for this function:

◆ dense_packed_kmer_count()

template<typename Config >

constexpr uint64_t cusbf::detail::dense_packed_kmer_count ( uint64_t num_symbols )

constexpr

Returns the number of k-mer windows in a dense packed symbol sequence.

Definition at line 26 of file dense_packed.cuh.

                                                                               {
    // Fewer than k symbols cannot hold a single window; otherwise there is
    // one window per valid start offset.
    if (num_symbols < Config::k) {
        return 0;
    }
    return num_symbols - Config::k + 1;
}

◆ dense_packed_query_word_tile_capacity()

template<typename Config >

constexpr uint64_t cusbf::detail::dense_packed_query_word_tile_capacity ( )

constexpr

Maximum uint64_t words loaded for a dense-packed query block tile.

Definition at line 29 of file kernels.cuh.

                                                                         {
    // The query tile covers cudaBlockSize * kContainsSequenceStride window
    // starts plus the k - 1 trailing symbols of the last window, rounded up
    // to whole packed words.
    constexpr uint64_t window_starts = Config::cudaBlockSize * kContainsSequenceStride;
    constexpr uint64_t symbols_in_tile = window_starts + Config::k - 1;
    constexpr uint64_t symbols_per_word = dense_packed_symbols_per_word<Config>();
    return cuda::ceil_div(symbols_in_tile, symbols_per_word);
}

Here is the call graph for this function:

◆ dense_packed_symbol_at()

template<typename Config >

__device__ __forceinline__ uint8_t cusbf::detail::dense_packed_symbol_at	(	const uint64_t *	words,
		uint64_t	symbol_index
	)

Decodes one packed symbol at global index symbol_index.

Definition at line 59 of file dense_packed.cuh.

                                                                     {
    // Symbols are packed from the low bits upward: slot s of a word occupies
    // bits [s * symbolBits, (s + 1) * symbolBits).
    constexpr uint64_t per_word = dense_packed_symbols_per_word<Config>();
    const uint64_t slot = symbol_index % per_word;
    const uint64_t word = words[symbol_index / per_word];
    const auto shift = static_cast<unsigned>(slot * Config::symbolBits);
    return static_cast<uint8_t>((word >> shift) & Config::symbolMask);
}

Here is the call graph for this function:

◆ dense_packed_symbol_at_local()

template<typename Config >

__device__ __forceinline__ uint8_t cusbf::detail::dense_packed_symbol_at_local	(	const uint64_t *	word_tile,
		uint64_t	first_word_index,
		uint64_t	symbol_index
	)

Decodes one packed symbol from a block-local word tile.

Definition at line 69 of file dense_packed.cuh.

  {
    // Same decoding as the global variant, except the word index is rebased
    // by first_word_index so it addresses the block-local tile.
    constexpr uint64_t per_word = dense_packed_symbols_per_word<Config>();
    const uint64_t global_word = symbol_index / per_word;
    const uint64_t slot = symbol_index % per_word;
    const uint64_t word = word_tile[global_word - first_word_index];
    const auto shift = static_cast<unsigned>(slot * Config::symbolBits);
    return static_cast<uint8_t>((word >> shift) & Config::symbolMask);
}

Here is the call graph for this function:

◆ dense_packed_symbols_per_word()

template<typename Config >

constexpr uint64_t cusbf::detail::dense_packed_symbols_per_word ( )

constexpr

Encoded symbols stored in each uint64_t word for Config.

Definition at line 14 of file dense_packed.cuh.

                                                                 {
    // A packed word is 64 bits wide; integer division drops any bits that do
    // not make up a whole symbol.
    constexpr auto word_bits = 64;
    return word_bits / Config::symbolBits;
}

◆ dense_packed_word_count()

template<typename Config >

constexpr uint64_t cusbf::detail::dense_packed_word_count ( uint64_t num_symbols )

constexpr

Returns the number of uint64_t words required for num_symbols encoded symbols.

Definition at line 20 of file dense_packed.cuh.

                                                                               {
    // Round up so a partially filled trailing word is still counted.
    constexpr uint64_t per_word = dense_packed_symbols_per_word<Config>();
    return cuda::ceil_div(num_symbols, per_word);
}

Here is the call graph for this function:

◆ dispatch_fastx_file()

template<typename Config , fastx_dispatch_handler Handler>

fastx_dispatch_handler_result_t< Handler > cusbf::detail::dispatch_fastx_file	(	const std::filesystem::path &	path,
		fastx_chunk_mode	mode,
		double	fill_fraction,
		Handler &&	handler
	)

Opens a FASTX path and invokes handler with a reader and dispatch path.

handler receives the reader and how the file was opened. Small files use fastx_dispatch_path::single_chunk_stream, GPU-sized inputs use fastx_dispatch_path::single_chunk_mmap, larger inputs use pipelined mmap or stream.

Definition at line 159 of file fastx_dispatch.hpp.

  {
    // path_view points into path_string and is handed to whichever reader is
    // built below, so the string must stay alive until the handler returns.
    const std::string path_string = path.string();
    const std::string_view path_view{path_string};
    const fastx_dispatch_path dispatch_path =
        select_fastx_dispatch_path<Config>(path, mode, fill_fraction);

    // Stream-backed paths: open the file as an input stream and parse from it.
    if (!fastx_uses_mmap_reader(dispatch_path)) {
        const auto input = openFastxFile(path);
        if (!input) {
            return Err(input.error());
        }
        FastxReader reader(**input, path_view);
        return handler(reader, dispatch_path);
    }

    // Buffer-backed paths: load the whole file and parse from memory.
    const auto buffer = FastxFileBuffer::load(path);
    if (!buffer) {
        return Err(buffer.error());
    }
    FastxBufferReader reader((*buffer)->data(), path_view);
    return handler(reader, dispatch_path);
}

Here is the call graph for this function:

◆ estimate_insert_staging_bytes()

template<typename Config >

constexpr uint64_t cusbf::detail::estimate_insert_staging_bytes	(	uint64_t	raw_bytes,
		uint64_t	record_count
	)

constexpr noexcept

Peak device bytes for insert staging (d_sequence_) for a host chunk.

Definition at line 82 of file fastx_chunk.cuh.

                                                                                  {
    // Insert staging is sized by the normalized sequence estimate alone.
    const uint64_t sequence_bytes =
        estimate_normalized_sequence_bytes<Config>(raw_bytes, record_count);
    return sequence_bytes;
}

Here is the call graph for this function:

◆ estimate_normalized_sequence_bytes()

template<typename Config >

constexpr uint64_t cusbf::detail::estimate_normalized_sequence_bytes	(	uint64_t	raw_bytes,
		uint64_t	record_count
	)

constexpr noexcept

Upper bound on normalized sequence bytes for a raw host chunk.

Definition at line 75 of file fastx_chunk.cuh.

                                                                                       {
    // Every record adds 2 * symbolWidth bytes on top of the raw payload.
    const uint64_t doubled_records = record_count * 2;
    const uint64_t per_record_extra = doubled_records * Config::symbolWidth;
    return raw_bytes + per_record_extra;
}

Here is the call graph for this function:

◆ estimate_query_staging_bytes()

template<typename Config >

constexpr uint64_t cusbf::detail::estimate_query_staging_bytes	(	uint64_t	raw_bytes,
		uint64_t	record_count
	)

constexpr noexcept

Peak device bytes for query staging (d_sequence_ + d_resultBuffer_).

Definition at line 89 of file fastx_chunk.cuh.

                                                                                 {
    // Sequence bytes plus the k-mer count of that sequence; the second term
    // presumably sizes the result buffer at one byte per k-mer (confirm
    // against the d_resultBuffer_ allocation).
    const uint64_t sequence_bytes =
        estimate_normalized_sequence_bytes<Config>(raw_bytes, record_count);
    const uint64_t kmer_slots = fastx_record_kmer_count<Config>(sequence_bytes);
    return sequence_bytes + kmer_slots;
}

Here is the call graph for this function:

◆ extractPackedSubwindow()

template<typename Config , uint64_t WindowLength, uint64_t K>

__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::extractPackedSubwindow	(	uint64_t	packed_kmer,
		uint64_t	start
	)

constexpr

Extracts an m-mer or s-mer subwindow from a packed k-mer at start.

Definition at line 130 of file filter_common.cuh.

                                                             {
    static_assert(WindowLength <= K, "WindowLength must not exceed K");
    // Shift the window down to the low bits, then mask off everything above
    // it. The shift counts the symbols to the right of the window, so start 0
    // selects the most significant WindowLength symbols of the k-mer.
    const auto shift = Config::symbolBits * (K - (start + WindowLength));
    const auto mask = packedWindowMask<Config, WindowLength>();
    return (packed_kmer >> shift) & mask;
}

Here is the call graph for this function:

◆ fastx_chunk_reached_host_byte_limit()

constexpr bool cusbf::detail::fastx_chunk_reached_host_byte_limit	(	size_t	host_chunk_max_bytes,
		uint64_t	raw_chunk_bytes
	)

inline constexpr noexcept

True when raw_chunk_bytes reaches the optional host assembly cap.

Definition at line 40 of file fastx_host_limits.cuh.

           {
    // The maximum size_t value is the "no cap" sentinel and never triggers.
    constexpr size_t unlimited = std::numeric_limits<size_t>::max();
    if (host_chunk_max_bytes == unlimited) {
        return false;
    }
    return raw_chunk_bytes >= host_chunk_max_bytes;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_chunk_reached_staging_budget()

template<typename Config >

constexpr bool cusbf::detail::fastx_chunk_reached_staging_budget	(	fastx_chunk_mode	mode,
		size_t	staging_budget_bytes,
		uint64_t	raw_bytes,
		uint64_t	record_count
	)

constexpr noexcept

Whether estimated device staging meets or exceeds staging_budget_bytes.

Definition at line 110 of file fastx_chunk.cuh.

           {
    // An empty chunk never reaches the budget; a zero budget is reached as
    // soon as the chunk holds any bytes.
    if (raw_bytes == 0) {
        return false;
    }
    if (staging_budget_bytes == 0) {
        return true;
    }

    // Insert mode uses the insert estimate; every other mode uses the query
    // estimate.
    if (mode == fastx_chunk_mode::insert) {
        return estimate_insert_staging_bytes<Config>(raw_bytes, record_count) >=
               staging_budget_bytes;
    }
    return estimate_query_staging_bytes<Config>(raw_bytes, record_count) >=
           staging_budget_bytes;
}

Here is the call graph for this function:

◆ fastx_chunk_should_flush()

template<typename Config >

bool cusbf::detail::fastx_chunk_should_flush	(	fastx_chunk_mode	mode,
		size_t	gpu_staging_budget_bytes,
		size_t	host_chunk_max_bytes,
		uint64_t	raw_chunk_bytes,
		uint64_t	record_count
	)

inline noexcept

Whether a host chunk should flush based on GPU staging and host byte limits.

Definition at line 143 of file fastx_chunk.cuh.

           {
    // Flush when either limit is hit; the GPU staging budget is checked first.
    if (fastx_chunk_reached_staging_budget<Config>(
            mode, gpu_staging_budget_bytes, raw_chunk_bytes, record_count
        )) {
        return true;
    }
    return fastx_chunk_reached_host_byte_limit(host_chunk_max_bytes, raw_chunk_bytes);
}

Here is the call graph for this function:

◆ fastx_chunk_slack_bytes()

constexpr size_t cusbf::detail::fastx_chunk_slack_bytes ( )

constexpr noexcept

Reserved device memory left for allocator and kernel temporaries.

Definition at line 44 of file fastx_chunk.cuh.

                                                                  {
    // 64 MiB.
    constexpr size_t mebibyte = size_t{1} << 20;
    return 64 * mebibyte;
}

Here is the caller graph for this function:

◆ fastx_column_at()

uint32_t cusbf::detail::fastx_column_at	(	std::string_view	line,
		size_t	byte_index
	)

inline

1-based column at byte_index within line (clamped to the line end).

Definition at line 268 of file Fastx.hpp.

                                                                                      {
    // An empty line still reports column 1.
    if (line.empty()) {
        return 1;
    }
    // Clamp to the last byte of the line, then convert the 0-based index to a
    // 1-based column.
    const size_t last_index = line.size() - 1;
    const size_t clamped = byte_index < last_index ? byte_index : last_index;
    return static_cast<uint32_t>(clamped + 1);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_file_bytes()

uint64_t cusbf::detail::fastx_file_bytes ( const std::filesystem::path & path )

inline

Definition at line 48 of file fastx_chunk.cuh.

                                                                              {
    // Size of `path` in bytes as reported by stat(). Returns 0 when stat()
    // fails, when the reported size is negative, and on non-Linux builds.
#if defined(__linux__)
    struct stat file_status{};
    const std::string native_path = path.string();
    if (::stat(native_path.c_str(), &file_status) != 0) {
        return 0;
    }
    if (file_status.st_size < 0) {
        return 0;
    }
    return static_cast<uint64_t>(file_status.st_size);
#else
    (void)path;
    return 0;
#endif
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_file_fits_gpu_staging()

template<typename Config >

bool cusbf::detail::fastx_file_fits_gpu_staging	(	const std::filesystem::path &	path,
		fastx_chunk_mode	mode,
		double	fill_fraction
	)

inline

Whether the entire uncompressed file fits in one GPU staging pass.

Definition at line 158 of file fastx_chunk.cuh.

  {
    // A size of 0 counts as fitting. fastx_file_bytes also reports 0 when the
    // size cannot be determined, so that case is treated as fitting too.
    const uint64_t file_bytes = fastx_file_bytes(path);
    if (file_bytes == 0) {
        return true;
    }

    // Without a free-memory reading, assume the file does not fit.
    const auto gpu_memory = query_cuda_free_memory();
    if (!gpu_memory) {
        return false;
    }

    // Treat the whole file as one chunk holding a single record.
    const size_t budget =
        fastx_staging_budget_bytes<Config>(fill_fraction, gpu_memory->free_bytes);
    const bool over_budget =
        fastx_chunk_reached_staging_budget<Config>(mode, budget, file_bytes, 1);
    return !over_budget;
}

Here is the call graph for this function:

◆ fastx_file_fits_in_memory()

bool cusbf::detail::fastx_file_fits_in_memory ( const std::filesystem::path & path )

inline

True when uncompressed path size is within fastx_memory_map_max_bytes.

Definition at line 66 of file fastx_host_memory.cuh.

                                                                                   {
    // Compares the stat()-reported size with fastx_memory_map_max_bytes().
    // Returns false when stat() fails, when the size is negative, and on
    // non-Linux builds.
#if defined(__linux__)
    struct stat file_status{};
    const std::string native_path = path.string();
    if (::stat(native_path.c_str(), &file_status) != 0) {
        return false;
    }
    if (file_status.st_size < 0) {
        return false;
    }
    const auto size_on_disk = static_cast<uint64_t>(file_status.st_size);
    return size_on_disk <= fastx_memory_map_max_bytes();
#else
    (void)path;
    return false;
#endif
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_file_supports_memory_map()

bool cusbf::detail::fastx_file_supports_memory_map ( const std::filesystem::path & path )

inline

True when path is not gzip-compressed (mmap path is usable).

Definition at line 164 of file fastx_file_buffer.hpp.

                                                                                        {
    // Only the gzip check decides this; nothing else about the file is inspected.
    const bool gzip_compressed = isGzipFile(path);
    return !gzip_compressed;
}

Here is the call graph for this function:

◆ fastx_fits_single_gpu_chunk()

template<typename Config >

bool cusbf::detail::fastx_fits_single_gpu_chunk	(	fastx_chunk_mode	mode,
		double	fill_fraction,
		uint64_t	file_bytes
	)

inline

Whether the entire file fits in a single GPU staging chunk at fill_fraction.

Definition at line 56 of file fastx_dispatch.hpp.

                                                                                              {
    // An empty file trivially fits.
    if (file_bytes == 0) {
        return true;
    }

    // Without a free-memory reading, assume the file does not fit.
    const auto gpu_memory = query_cuda_free_memory();
    if (!gpu_memory) {
        return false;
    }

    const size_t budget =
        fastx_staging_budget_bytes<Config>(fill_fraction, gpu_memory->free_bytes);
    // A zero budget is reported as fitting here.
    // NOTE(review): fastx_chunk_reached_staging_budget treats a zero budget as
    // already reached, so this early return inverts that answer; confirm it is
    // intentional.
    if (budget == 0) {
        return true;
    }

    // Treat the whole file as one chunk holding a single record.
    const bool over_budget =
        fastx_chunk_reached_staging_budget<Config>(mode, budget, file_bytes, 1);
    return !over_budget;
}

Here is the call graph for this function:

◆ fastx_host_chunk_max_bytes()

size_t cusbf::detail::fastx_host_chunk_max_bytes ( )

inline

Optional host assembly byte cap before flush (debug / low-RAM safety valve).

Returns SIZE_MAX when unset so fastx_chunk_should_flush uses GPU staging only.

Definition at line 25 of file fastx_host_limits.cuh.

                                                         {
    // CUSBF_FASTX_MAX_HOST_CHUNK_MB takes precedence when it parses to a
    // non-zero byte count.
    const size_t override_bytes = parse_host_chunk_max_bytes("CUSBF_FASTX_MAX_HOST_CHUNK_MB");
    if (override_bytes != 0) {
        return override_bytes;
    }

    // Otherwise fall back to CUSBF_LARGE_FASTX_HOST_CHUNK_MB.
    const size_t large_bytes = parse_host_chunk_max_bytes("CUSBF_LARGE_FASTX_HOST_CHUNK_MB");
    if (large_bytes != 0) {
        return large_bytes;
    }

    // Neither variable produced a cap: report "unlimited".
    return std::numeric_limits<size_t>::max();
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_host_ram_slack_bytes()

size_t cusbf::detail::fastx_host_ram_slack_bytes ( )

inline

Definition at line 22 of file fastx_host_memory.cuh.

                                                         {
    // Host RAM slack in bytes, read from CUSBF_FASTX_HOST_RAM_SLACK_MB (MiB).
    // An empty value, or one that parses to 0, falls back to
    // kDefaultFastxHostRamSlackBytes.
    const std::string_view raw = getenv_value("CUSBF_FASTX_HOST_RAM_SLACK_MB");
    uint64_t mebibytes = 0;
    if (!raw.empty()) {
        mebibytes = parse_env_mebibytes(raw);
    }
    if (mebibytes == 0) {
        return kDefaultFastxHostRamSlackBytes;
    }
    return static_cast<size_t>(mebibytes) << 20;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_is_single_chunk_path()

constexpr bool cusbf::detail::fastx_is_single_chunk_path ( fastx_dispatch_path path )

constexpr noexcept

True for fastx_dispatch_path::single_chunk_stream or fastx_dispatch_path::single_chunk_mmap.

Definition at line 42 of file fastx_dispatch.hpp.

                                                                                           {
    // Only the two single-chunk enumerators qualify.
    switch (path) {
        case fastx_dispatch_path::single_chunk_stream:
        case fastx_dispatch_path::single_chunk_mmap:
            return true;
        default:
            return false;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_memory_map_max_bytes()

uint64_t cusbf::detail::fastx_memory_map_max_bytes ( )

inline

Upper bound on file bytes that may be mmap'd (env cap and available RAM minus slack).

Definition at line 48 of file fastx_host_memory.cuh.

                                                           {
    // Optional cap from CUSBF_FASTX_MMAP_MAX_MB (MiB); a zero parse result
    // leaves the cap at UINT64_MAX.
    const uint64_t cap_mebibytes = parse_env_mebibytes(getenv_value("CUSBF_FASTX_MMAP_MAX_MB"));
    uint64_t cap_bytes = UINT64_MAX;
    if (cap_mebibytes != 0) {
        cap_bytes = cap_mebibytes << 20;
    }

    // An available-RAM reading of 0 means only the env cap applies.
    const size_t available = query_available_host_bytes();
    if (available == 0) {
        return cap_bytes;
    }

    // Otherwise the limit is the smaller of the cap and available RAM minus
    // slack, with the RAM budget floored at zero.
    const size_t slack = fastx_host_ram_slack_bytes();
    size_t ram_budget = 0;
    if (available > slack) {
        ram_budget = available - slack;
    }
    return std::min(cap_bytes, static_cast<uint64_t>(ram_budget));
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_pipelined_chunk_budget()

constexpr size_t cusbf::detail::fastx_pipelined_chunk_budget	(	fastx_chunk_mode	mode,
		size_t	staging_budget_bytes
	)

constexpr noexcept

Per-chunk flush budget for dual-stream ping-pong (two device sequence buffers).

Definition at line 131 of file fastx_chunk.cuh.

                                                                                          {
    // Insert mode gets half of the staging budget per chunk; every other mode
    // gets a third. A zero budget divides to zero, so it needs no special case.
    const size_t divisor = (mode == fastx_chunk_mode::insert) ? size_t{2} : size_t{3};
    return staging_budget_bytes / divisor;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_quality_excess_column()

uint32_t cusbf::detail::fastx_quality_excess_column	(	uint64_t	quality_length,
		uint64_t	expected_length,
		std::string_view	line
	)

inline

1-based column of the first quality byte that exceeds expected_length.

Definition at line 276 of file Fastx.hpp.

  {
    // Quality bytes seen before this line; assumes quality_length already
    // includes this line's bytes (TODO confirm against the caller).
    const uint64_t before = quality_length - line.size();

    // If the limit falls inside this line, point at the first byte past it;
    // otherwise use line.size(), which fastx_column_at clamps to the last column.
    size_t byte_index = line.size();
    if (expected_length > before) {
        byte_index = expected_length - before;
    }
    return fastx_column_at(line, byte_index);
}

Here is the call graph for this function:

◆ fastx_quality_short_column()

uint32_t cusbf::detail::fastx_quality_short_column ( std::string_view line )

inline

1-based column where a quality run ends too short (position after the last byte).

Definition at line 287 of file Fastx.hpp.

                                                                              {
    // One past the last byte; an empty line reports column 1.
    if (line.empty()) {
        return 1;
    }
    return static_cast<uint32_t>(line.size() + 1);
}

Here is the call graph for this function:

◆ fastx_record_kmer_count()

template<typename Config >

constexpr uint64_t cusbf::detail::fastx_record_kmer_count ( uint64_t bases )

constexpr noexcept

Definition at line 68 of file fastx_chunk.cuh.

                                                                                  {
    // Thin forwarder to record_kmer_count<Config> for a record of `bases` bases.
    const uint64_t kmers = record_kmer_count<Config>(bases);
    return kmers;
}

Here is the call graph for this function:

◆ fastx_record_symbol_count()

template<typename Config >

constexpr uint64_t cusbf::detail::fastx_record_symbol_count ( uint64_t bases )

constexpr noexcept

Definition at line 63 of file fastx_chunk.cuh.

                                                                                    {
    // Thin forwarder to record_symbol_count<Config> for a record of `bases` bases.
    const uint64_t symbols = record_symbol_count<Config>(bases);
    return symbols;
}

Here is the call graph for this function:

◆ fastx_single_chunk_stream_max_bytes()

uint64_t cusbf::detail::fastx_single_chunk_stream_max_bytes ( )

inline

Max raw file size for fastx_dispatch_path::single_chunk_stream (istream, no mmap).

Larger files that still fit one GPU chunk use fastx_dispatch_path::single_chunk_mmap.

Definition at line 77 of file fastx_dispatch.hpp.

                                                                    {
    // Default threshold is 32 MiB; CUSBF_FASTX_SINGLE_CHUNK_STREAM_MAX_MB
    // (MiB) replaces it whenever it parses to a non-zero value.
    constexpr uint64_t kDefaultBytes = uint64_t{32} << 20;
    const uint64_t mebibytes =
        parse_env_mebibytes(getenv_value("CUSBF_FASTX_SINGLE_CHUNK_STREAM_MAX_MB"));
    return mebibytes != 0 ? mebibytes << 20 : kDefaultBytes;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fastx_staging_budget_bytes()

template<typename Config >

constexpr size_t cusbf::detail::fastx_staging_budget_bytes	(	double	fill_fraction,
		size_t	free_bytes
	)

constexpr noexcept

Device staging byte budget derived from free VRAM and fill_fraction.

Definition at line 97 of file fastx_chunk.cuh.

                                                                             {
    // Budget = (free_bytes - slack) * fill_fraction, truncated to whole bytes.
    //
    // Returns 0 when fill_fraction is zero, negative, or NaN, and when no
    // memory remains after subtracting the slack. The comparisons are written
    // as !(x > 0.0) rather than (x <= 0.0) so that NaN also takes the zero
    // path: converting NaN or an out-of-range double to size_t is undefined
    // behaviour.
    if (!(fill_fraction > 0.0)) {
        return 0;
    }

    const size_t slack = fastx_chunk_slack_bytes();
    const size_t available = free_bytes > slack ? free_bytes - slack : size_t{0};
    const double budget = static_cast<double>(available) * fill_fraction;

    // Zero, or NaN from 0 * infinity.
    if (!(budget > 0.0)) {
        return 0;
    }

    // Saturate instead of overflowing the conversion for huge fractions.
    constexpr size_t max_budget = ~size_t{0};
    if (budget >= static_cast<double>(max_budget)) {
        return max_budget;
    }
    return static_cast<size_t>(budget);
}

Here is the call graph for this function:

◆ fastx_uses_mmap_reader()

constexpr bool cusbf::detail::fastx_uses_mmap_reader ( fastx_dispatch_path path )

constexpr noexcept

True when dispatch uses FastxBufferReader over an mmap'd file.

Definition at line 48 of file fastx_dispatch.hpp.

                                                                                       {
    // Only the two mmap enumerators qualify.
    switch (path) {
        case fastx_dispatch_path::single_chunk_mmap:
        case fastx_dispatch_path::chunked_mmap:
            return true;
        default:
            return false;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ forEachHashIndex()

template<typename Config , typename Fn >

__host__ __device__ __forceinline__ void cusbf::detail::forEachHashIndex ( Fn && fn )

Invokes fn once per Bloom hash index (compile-time unrolled).

Definition at line 111 of file filter_common.cuh.

                                                                   {
    // Build the index pack 0 .. hashCount-1 and let the Impl overload expand
    // it; fn is forwarded with its original value category.
    using HashIndexSequence = std::make_index_sequence<Config::hashCount>;
    forEachHashIndexImpl<Config>(static_cast<Fn&&>(fn), HashIndexSequence{});
}

Here is the call graph for this function:

◆ forEachHashIndexImpl()

template<typename Config , typename Fn , uint64_t... HashIndices>

__host__ __device__ __forceinline__ void cusbf::detail::forEachHashIndexImpl	(	Fn &&	fn,
		std::index_sequence< HashIndices... >
	)

Unrolled invocation of fn for each Bloom hash index in Config.

Definition at line 105 of file filter_common.cuh.

                                                                 {
    // Comma fold over the index pack: fn is called once per index, in pack
    // order, and each call receives the index as a
    // std::integral_constant<uint64_t, I> so the callee can use it as a
    // compile-time value.
    (fn(std::integral_constant<uint64_t, HashIndices>{}), ...);
}

Here is the call graph for this function:

◆ getenv_value()

std::string_view cusbf::detail::getenv_value ( const char * env_name )

inline

Reads env_name via getenv, or an empty view when unset.

Definition at line 32 of file host_parse.hpp.

                                                                       {
    // An unset variable and one set to the empty string both yield an empty
    // view; otherwise the view covers the C string returned by std::getenv.
    const char* raw = std::getenv(env_name);
    const bool unset_or_blank = (raw == nullptr) || (*raw == '\0');
    if (unset_or_blank) {
        return std::string_view{};
    }
    return std::string_view{raw};
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ hash64()

constexpr __host__ __device__ __forceinline__ uint64_t cusbf::detail::hash64 ( uint64_t key )

constexpr

Fast 64-bit integer hash (non-cryptographic).

One multiplicative step followed by an xorshift. Used to hash s-mer packed representations for Bloom bit-position selection.

Parameters

key	Input value.

Returns: Hashed value.

Definition at line 192 of file hashutil.cuh.

                                                                            {
    // Multiply by a 64-bit odd constant (wrapping), then fold the high bits
    // into the low bits with a 33-bit xorshift.
    const uint64_t mixed = key * 0x9e3779b97f4a7c15ULL;
    return mixed ^ (mixed >> 33);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ insert_dense_packed_kmers_kernel()

template<typename Config >

__global__ void cusbf::detail::insert_dense_packed_kmers_kernel	(	DensePackedKmerInput< Config >	input,
		device_span< filter_block< Config > >	shards
	)

Insert kernel for a dense packed symbol buffer (DensePackedKmerInput).

Definition at line 283 of file kernels.cuh.

  {
    // Each block handles Config::cudaBlockSize consecutive k-mers starting at
    // blockIdx.x * cudaBlockSize, and the shared insert path indexes k-mers by
    // threadIdx.x. This assumes the launch uses blockDim.x ==
    // Config::cudaBlockSize (TODO confirm at the launch site).
    //
    // Shared memory: a packed word tile, a decoded symbol tile, and four
    // cub::WarpReduce temp-storage slots per warp.
    constexpr uint64_t sequence_tile_bases = Config::cudaBlockSize + Config::k - 1;
    constexpr uint32_t warps_per_block = Config::cudaBlockSize / 32;
    constexpr uint64_t word_tile_capacity = dense_packed_insert_word_tile_capacity<Config>();

    // warps_per_block truncates, so a block size that is not a whole number of
    // warps would index reduce_storage out of bounds for the trailing partial
    // warp and would run the full-mask warp shuffle with missing lanes.
    static_assert(
        Config::cudaBlockSize >= 32 && Config::cudaBlockSize % 32 == 0,
        "Config::cudaBlockSize must be a non-zero multiple of the warp size (32)"
    );

    using WarpReduceWord = cub::WarpReduce<uint64_t>;

    __shared__ uint64_t word_tile[word_tile_capacity];
    __shared__ uint8_t sequence_tile[sequence_tile_bases];
    __shared__ typename WarpReduceWord::TempStorage reduce_storage[warps_per_block][4];

    const uint64_t num_kmers = input.kmerCount();
    const uint64_t block_start_kmer = static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize;
    // Block-uniform exit: either every thread of the block returns here or
    // none does, so the collective calls below see the whole block.
    if (block_start_kmer >= num_kmers) {
        return;
    }

    // The last block may cover fewer than cudaBlockSize k-mers.
    const uint64_t block_kmers = min(Config::cudaBlockSize, num_kmers - block_start_kmer);

    // Fills the tiles for this block; the returned flag is forwarded as
    // block_all_valid so the insert path can skip per-k-mer validity checks.
    const bool block_all_valid = prepare_dense_packed_tiles<Config>(
        input.words.data(), block_start_kmer, block_kmers, word_tile, sequence_tile
    );

    insert_kmers_from_symbol_tile<Config, warps_per_block>(
        sequence_tile, block_start_kmer, block_kmers, block_all_valid, shards, reduce_storage
    );
}

Here is the call graph for this function:

◆ insert_kmers_from_symbol_tile()

template<typename Config , uint32_t warps_per_block>

__device__ __forceinline__ void cusbf::detail::insert_kmers_from_symbol_tile	(	const uint8_t *	sequence_tile,
		uint64_t	block_start_kmer,
		uint64_t	block_kmers,
		bool	block_all_valid,
		device_span< filter_block< Config > >	shards,
		cub::WarpReduce< uint64_t >::TempStorage	reduce_storage[warps_per_block][4]
	)

Shared insert path after a block symbol tile has been prepared.

Used by both byte-sequence and dense-packed symbol kernels.

Definition at line 107 of file kernels.cuh.

  {
    // Per-thread flow: decide whether this thread owns a valid k-mer, build
    // its four word masks, OR-merge the masks of consecutive lanes that target
    // the same shard, and let the first lane of each run apply the merged
    // masks. block_start_kmer is not read in this body.
    constexpr uint32_t warp_size = 32;

    // One k-mer per thread, indexed by threadIdx.x within the block tile.
    const auto local_kmer_index = static_cast<uint64_t>(threadIdx.x);
    const bool in_range = local_kmer_index < block_kmers;

    // The per-k-mer validity scan is skipped when the caller reports the whole
    // block as valid.
    bool active = in_range;
    if (active && !block_all_valid) {
        active = kmer_is_valid<Config>(sequence_tile, local_kmer_index);
    }

    // Inactive threads keep all-zero masks, so they add nothing to the OR
    // reduction below.
    uint64_t minimizer_hash = 0;
    uint64_t word_mask0 = 0;
    uint64_t word_mask1 = 0;
    uint64_t word_mask2 = 0;
    uint64_t word_mask3 = 0;

    if (active) {
        const uint64_t packed_kmer =
            pack_kmer_from_tile<Config, Config::k>(sequence_tile, local_kmer_index);
        minimizer_hash = packed_kmer_minimizer_hash<Config>(packed_kmer);

        // One s-mer hash per offset in [0, findereSpan); each call updates the
        // four masks (presumably by OR-ing bits in; confirm in
        // sectorizedHashToMasks).
        uint64_t h_s = packed_kmer_smer_hash<Config>(packed_kmer, 0);
        filter_block<Config>::sectorizedHashToMasks(
            h_s, word_mask0, word_mask1, word_mask2, word_mask3
        );
        _Pragma("unroll")
        for (uint64_t smer_offset = 1; smer_offset < Config::findereSpan; ++smer_offset) {
            h_s = packed_kmer_smer_hash<Config>(packed_kmer, smer_offset);
            filter_block<Config>::sectorizedHashToMasks(
                h_s, word_mask0, word_mask1, word_mask2, word_mask3
            );
        }
    }

    // Active lanes select a shard with `hash & (size - 1)`, which is a valid
    // index only when shards.size() is a power of two (TODO confirm). Inactive
    // lanes get ~threadIdx.x as a placeholder, presumably so they do not
    // extend a neighbouring run.
    const auto shard_idx =
        static_cast<uint32_t>(active ? (minimizer_hash & (shards.size() - 1)) : ~threadIdx.x);

    const uint32_t lane = threadIdx.x & (warp_size - 1);
    const uint32_t warp_idx = threadIdx.x / warp_size;
    // Full-warp mask: all 32 lanes of the warp must reach this shuffle.
    const uint32_t prev_shard_idx = __shfl_up_sync(0xffffffff, shard_idx, 1);
    // A run starts at lane 0 or wherever the shard differs from the previous lane.
    const bool run_head = (lane == 0) || (shard_idx != prev_shard_idx);
    const BitwiseOr<uint64_t> bitwise_or{};

    // One segmented OR reduction per mask word, each with its own temp-storage
    // slot for this warp.
    using WarpReduceWord = cub::WarpReduce<uint64_t>;
    word_mask0 = WarpReduceWord(reduce_storage[warp_idx][0])
                     .HeadSegmentedReduce(word_mask0, run_head, bitwise_or);
    word_mask1 = WarpReduceWord(reduce_storage[warp_idx][1])
                     .HeadSegmentedReduce(word_mask1, run_head, bitwise_or);
    word_mask2 = WarpReduceWord(reduce_storage[warp_idx][2])
                     .HeadSegmentedReduce(word_mask2, run_head, bitwise_or);
    word_mask3 = WarpReduceWord(reduce_storage[warp_idx][3])
                     .HeadSegmentedReduce(word_mask3, run_head, bitwise_or);

    // Only the head lane of a run applies the merged masks, and only if it
    // holds a real k-mer.
    if (run_head && active) {
        filter_ref<Config> ref;
        ref.apply_word_masks(shards[shard_idx], word_mask0, word_mask1, word_mask2, word_mask3);
    }
}

Here is the call graph for this function:

◆ insert_sequence_kmers_kernel()

template<typename Config >

__global__ void cusbf::detail::insert_sequence_kmers_kernel	(	SequenceKmerInput< Config >	input,
		device_span< filter_block< Config > >	shards
	)

Insert kernel: sectorized Bloom updates grouped by minimizer shard.

Warp-local segmented reduction merges consecutive k-mers targeting the same shard.

Definition at line 215 of file kernels.cuh.

  {
    // Each block handles Config::cudaBlockSize consecutive k-mers starting at
    // blockIdx.x * cudaBlockSize, and the shared insert path indexes k-mers by
    // threadIdx.x. This assumes the launch uses blockDim.x ==
    // Config::cudaBlockSize (TODO confirm at the launch site).
    //
    // Shared memory: a symbol tile and four cub::WarpReduce temp-storage slots
    // per warp.
    constexpr uint64_t sequence_tile_bases = Config::cudaBlockSize + Config::k - 1;
    constexpr uint32_t warps_per_block = Config::cudaBlockSize / 32;

    // warps_per_block truncates, so a block size that is not a whole number of
    // warps would index reduce_storage out of bounds for the trailing partial
    // warp and would run the full-mask warp shuffle with missing lanes.
    static_assert(
        Config::cudaBlockSize >= 32 && Config::cudaBlockSize % 32 == 0,
        "Config::cudaBlockSize must be a non-zero multiple of the warp size (32)"
    );

    using WarpReduceWord = cub::WarpReduce<uint64_t>;

    __shared__ uint8_t sequence_tile[sequence_tile_bases];
    __shared__ typename WarpReduceWord::TempStorage reduce_storage[warps_per_block][4];

    const uint64_t num_kmers = input.kmerCount();
    const uint64_t block_start_kmer = static_cast<uint64_t>(blockIdx.x) * Config::cudaBlockSize;
    // Block-uniform exit: either every thread of the block returns here or
    // none does, so the collective calls below see the whole block.
    if (block_start_kmer >= num_kmers) {
        return;
    }

    // The last block may cover fewer than cudaBlockSize k-mers.
    const uint64_t block_kmers = min(Config::cudaBlockSize, num_kmers - block_start_kmer);

    // Fills the symbol tile for this block; the returned flag is forwarded as
    // block_all_valid so the insert path can skip per-k-mer validity checks.
    const bool block_all_valid = prepare_sequence_hash_tiles<Config>(
        input.sequence.data(), block_start_kmer, block_kmers, sequence_tile
    );

    insert_kmers_from_symbol_tile<Config, warps_per_block>(
        sequence_tile, block_start_kmer, block_kmers, block_all_valid, shards, reduce_storage
    );
}

Here is the call graph for this function:

◆ isGzipFile()

bool cusbf::detail::isGzipFile ( const std::filesystem::path & path )

inline

True when path begins with the gzip magic bytes (0x1F, 0x8B).

Definition at line 101 of file gzstreambuf.hpp.

                                                      {
    // Reads the first two bytes of the file. Any failure to open it, or to
    // read both bytes, is reported as "not gzip".
    const auto native_path = path.string();
    FILE* file = std::fopen(native_path.c_str(), "rb");
    if (file == nullptr) {
        return false;
    }

    uint8_t magic[2];
    const size_t bytes_read = std::fread(magic, 1, 2, file);
    std::fclose(file);

    if (bytes_read != 2) {
        return false;
    }
    return magic[0] == 0x1F && magic[1] == 0x8B;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ kmer_is_valid()

template<typename Config >

__device__ __forceinline__ bool cusbf::detail::kmer_is_valid	(	const uint8_t *	tile,
		uint64_t	start
	)

True when no symbol in the k-mer window is the alphabet invalid sentinel.

Definition at line 93 of file sequence_kmer.cuh.

                                                                                   {
    // Scan the k symbols starting at tile[start] and stop at the first
    // invalid-symbol sentinel.
    const uint8_t* window = tile + start;
    _Pragma("unroll")
    for (uint64_t offset = 0; offset < Config::k; ++offset) {
        if (window[offset] == Config::Alphabet::invalidSymbol) {
            return false;
        }
    }
    return true;
}

Here is the call graph for this function:

◆ load128BitGlobalNC()

__device__ __forceinline__ void cusbf::detail::load128BitGlobalNC	(	const uint64_t *	ptr,
		uint64_t &	out0,
		uint64_t &	out1
	)

Loads 128 bits from global memory using the non-coherent cache path.

Uses the ld.global.nc.v2.u64 instruction to load two uint64_t values.

Definition at line 74 of file helpers.cuh.

                                                                        {
    // One vectorized PTX load: the two 64-bit words at ptr[0] and ptr[1] land
    // in out0 and out1 ("=l" = 64-bit register output, "l" = 64-bit address).
    // NOTE(review): PTX vector loads need the address aligned to the access
    // size (16 bytes here); confirm that callers guarantee this.
    asm volatile("ld.global.nc.v2.u64 {%0, %1}, [%2];" : "=l"(out0), "=l"(out1) : "l"(ptr));
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ load256BitGlobalNC() [1/2]

template<typename T >

__device__ __forceinline__ void cusbf::detail::load256BitGlobalNC	(	const T *	ptr,
		T *	out
	)

Loads 256 bits from global memory using the non-coherent cache path.

This function uses inline PTX for 256-bit vectorized loads. For uint64_t it loads 4 values (v4.u64); for uint32_t it loads 8 values (v8.u32).

Note: Only available on sm_100+ architectures with PTX 8.8. Use __CUDA_ARCH__ >= 1000 guard at call sites.

Template Parameters

T	Element type (uint32_t or uint64_t)

Parameters

ptr	Source pointer (must be 32-byte aligned)
out	Output array (4 elements for uint64_t, 8 for uint32_t)

Definition at line 32 of file helpers.cuh.

                                                                         {
    // Only 4-byte and 8-byte element types are accepted; anything else fails to compile.
    static_assert(sizeof(T) == 4 || sizeof(T) == 8, "T must be uint32_t or uint64_t");
 
    // The element width selects the PTX vector form at compile time.
    if constexpr (sizeof(T) == 8) {
        // Four 64-bit registers ("=l" constraints) filled from [ptr] into out[0..3].
        asm volatile("ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
                     : "=l"(out[0]), "=l"(out[1]), "=l"(out[2]), "=l"(out[3])
                     : "l"(ptr));
    } else {
        // Eight 32-bit registers ("=r" constraints) filled from [ptr] into out[0..7].
        asm volatile("ld.global.nc.v8.u32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
                     : "=r"(out[0]),
                       "=r"(out[1]),
                       "=r"(out[2]),
                       "=r"(out[3]),
                       "=r"(out[4]),
                       "=r"(out[5]),
                       "=r"(out[6]),
                       "=r"(out[7])
                     : "l"(ptr));
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ load256BitGlobalNC() [2/2]

__device__ __forceinline__ void cusbf::detail::load256BitGlobalNC	(	const uint64_t *	ptr,
		uint64_t &	out0,
		uint64_t &	out1,
		uint64_t &	out2,
		uint64_t &	out3
	)

Loads four uint64_t words via ld.global.nc.v4.u64 (sm_100+ only).

Definition at line 54 of file helpers.cuh.

  {
    // Inline PTX: a single v4.u64 non-coherent global load that writes the four
    // 64-bit values read at ptr into out0..out3.
    // NOTE(review): presumably requires ptr to be 32-byte aligned, as the pointer
    // overload documents — confirm at call sites.
    asm volatile("ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
                 : "=l"(out0), "=l"(out1), "=l"(out2), "=l"(out3)
                 : "l"(ptr));
}

Here is the call graph for this function:

◆ load_shard_words4()

template<typename Config >

__device__ __forceinline__ void cusbf::detail::load_shard_words4	(	const filter_block< Config > *	shards,
		uint64_t	shard_index,
		uint64_t *	w
	)

Loads four 64-bit shard words with 256-bit (sm_100+) or 128-bit vector loads.

Definition at line 64 of file sequence_kmer.cuh.

                                                                                         {
    // Resolve the shard's word array once; both code paths read its first four words.
    const uint64_t* const shard_words = shards[shard_index].words;
#if __CUDA_ARCH__ >= 1000
    // One 256-bit load covers all four words.
    load256BitGlobalNC(shard_words, w[0], w[1], w[2], w[3]);
#else
    // Two 128-bit loads: words 0-1, then words 2-3.
    load128BitGlobalNC(shard_words, w[0], w[1]);
    load128BitGlobalNC(shard_words + 2, w[2], w[3]);
#endif
}

Here is the call graph for this function:

◆ maxOccupancyGridSize()

template<typename Kernel >

uint64_t cusbf::detail::maxOccupancyGridSize	(	int32_t	blockSize,
		Kernel	kernel,
		uint64_t	dynamicSMemSize
	)

Calculates the maximum occupancy grid size for a kernel.

Template Parameters

Kernel Type of the kernel function.

Parameters

blockSize	Block size (threads per block).
kernel	The kernel function.
dynamicSMemSize	Dynamic shared memory size per block.

Returns: uint64_t The calculated grid size (number of blocks).

Definition at line 115 of file helpers.cuh.

                                                                                          {
    // Every CUDA query is checked. On any failure (or a non-positive result) the
    // function returns 0 instead of a grid size derived from sentinel or
    // uninitialized values.
    int device = 0;
    if (cudaGetDevice(&device) != cudaSuccess) {
        return 0;
    }

    int numSM = 0;
    if (cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device) != cudaSuccess ||
        numSM <= 0) {
        return 0;
    }

    int maxActiveBlocksPerSM = 0;
    if (cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &maxActiveBlocksPerSM, kernel, blockSize, dynamicSMemSize
        ) != cudaSuccess ||
        maxActiveBlocksPerSM <= 0) {
        return 0;
    }

    // Widen before multiplying so the product is computed in 64 bits.
    return static_cast<uint64_t>(maxActiveBlocksPerSM) * static_cast<uint64_t>(numSM);
}

Here is the call graph for this function:

◆ minimizer_hash64()

constexpr __host__ __device__ __forceinline__ uint64_t cusbf::detail::minimizer_hash64 ( uint64_t key )

constexpr

Fast 64-bit hash sufficient for uniform minimizer selection.

A single Knuth multiplicative step — provides enough uniformity for shard selection without the full avalanche quality of hash64.

Parameters

key	Packed m-mer input.

Returns: Hash value used to select the minimum (minimizer).

Definition at line 207 of file hashutil.cuh.

                                                                                      {
    // Single multiplicative step; the product wraps modulo 2^64.
    constexpr uint64_t kMultiplier = 0x9E3779B97F4A7C15ULL;
    return key * kMultiplier;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ multiplicativeSaltLiteral()

template<uint64_t Index>

__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::multiplicativeSaltLiteral ( )

constexpr

Compile-time multiplicative salt for Bloom hash index Index.

Definition at line 97 of file filter_common.cuh.

                                                                                                 {
    // Reject indices outside 0..15 at compile time.
    static_assert(Index < 16, "Salt index out of range");
    // The salt itself comes from the SaltLiteral<Index> specialization.
    return SaltLiteral<Index>::value;
}

Here is the call graph for this function:

◆ openFastxFile()

Result< std::unique_ptr< std::istream > > cusbf::detail::openFastxFile ( const std::filesystem::path & path )

inline

Opens a FASTA/FASTQ file for reading.

Parameters

path	File path.

Returns: Open input file stream, or an I/O error.

Definition at line 470 of file Fastx.hpp.

  {
    // Plain-text inputs go through std::ifstream; anything carrying the gzip
    // magic is handed to the gzip stream adapter.
    if (!isGzipFile(path)) {
        auto plain = std::make_unique<std::ifstream>(path);
        if (plain->is_open()) {
            return plain;
        }
        return Err(Error::io(std::format("Failed to open FASTA/FASTQ file: {}", path.string())));
    }
    return GzIstream::open(path);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ pack_dense_sequence_kernel()

template<typename Config >

__global__ void cusbf::detail::pack_dense_sequence_kernel	(	const char *	sequence,
		uint64_t	num_symbols,
		uint64_t *	words
	)

Encodes a raw byte sequence and packs the resulting symbols into dense symbolBits-wide fields of uint64_t words.

One thread per output word; reads Config::symbolWidth bytes per symbol via Config::Alphabet::encode.

Definition at line 128 of file dense_packed.cuh.

                                                                                        {
    constexpr uint64_t symbols_per_word = dense_packed_symbols_per_word<Config>();
    // Widen blockIdx.x before multiplying so the flat index is computed in 64 bits.
    const uint64_t word_index = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (word_index >= dense_packed_word_count<Config>(num_symbols)) {
        return;  // Threads past the last output word have nothing to write.
    }

    const uint64_t base_symbol = word_index * symbols_per_word;
    uint64_t accumulator = 0;
    // The last word may be partial: stop at num_symbols and leave the unused bits zero.
    for (uint64_t slot = 0; slot < symbols_per_word && base_symbol + slot < num_symbols; ++slot) {
        const char* const raw = sequence + (base_symbol + slot) * Config::symbolWidth;
        const uint8_t code = Config::Alphabet::encode(raw);
        const uint64_t field = static_cast<uint64_t>(code & Config::symbolMask);
        // Symbol `slot` occupies bits [slot * symbolBits, (slot + 1) * symbolBits).
        accumulator |= field << (slot * Config::symbolBits);
    }
    words[word_index] = accumulator;
}

Here is the call graph for this function:

◆ pack_kmer_from_tile()

template<typename Config , uint64_t K>

__device__ __forceinline__ uint64_t cusbf::detail::pack_kmer_from_tile	(	const uint8_t *	tile,
		uint64_t	start
	)

Packs K encoded symbols from a shared-memory tile starting at start.

Definition at line 75 of file sequence_kmer.cuh.

                                                                                             {
    // Shift-and-append: the symbol at `start` ends up in the most significant field.
    uint64_t kmer = 0;
    _Pragma("unroll")
    for (uint64_t offset = 0; offset < K; ++offset) {
        const uint64_t symbol = tile[start + offset] & Config::symbolMask;
        kmer <<= Config::symbolBits;
        kmer |= symbol;
    }
    return kmer;
}

Here is the call graph for this function:

◆ packed_kmer_minimizer_hash()

template<typename Config >

__device__ __forceinline__ uint64_t cusbf::detail::packed_kmer_minimizer_hash ( uint64_t packed_kmer )

Minimum minimizer hash over all m-mers in a packed k-mer.

Definition at line 41 of file sequence_kmer.cuh.

                                                                                                   {
    // Start from the sentinel so any real hash value replaces it.
    uint64_t best = kInvalidHash;
    _Pragma("unroll")
    for (uint64_t window = 0; window < Config::minimizerSpan; ++window) {
        const uint64_t mmer =
            extractPackedSubwindow<Config, Config::m, Config::k>(packed_kmer, window);
        const uint64_t candidate = minimizer_hash64(mmer);
        if (candidate < best) {
            best = candidate;
        }
    }
    return best;
}

Here is the call graph for this function:

◆ packed_kmer_smer_hash()

template<typename Config >

__device__ __forceinline__ uint64_t cusbf::detail::packed_kmer_smer_hash	(	uint64_t	packed_kmer,
		uint64_t	start
	)

Bloom hash for the s-mer at start within a packed k-mer.

Definition at line 55 of file sequence_kmer.cuh.

                                                            {
    // Cut the s-symbol window at `start` out of the packed k-mer, then hash it.
    const uint64_t smer = extractPackedSubwindow<Config, Config::s, Config::k>(packed_kmer, start);
    const uint64_t digest = hash64(smer);
    return digest;
}

Here is the call graph for this function:

◆ packedWindowMask()

template<typename Config , uint64_t Length>

__host__ __device__ __forceinline__ constexpr uint64_t cusbf::detail::packedWindowMask ( )

constexpr

Bit mask retaining the low Length symbols of a packed k-mer.

Definition at line 119 of file filter_common.cuh.

                                                                                        {
    constexpr uint64_t window_bits = Length * Config::symbolBits;
    if constexpr (window_bits < 64) {
        // Low `window_bits` bits set.
        return (uint64_t{1} << window_bits) - 1;
    } else {
        // A 64-bit shift would be undefined, so a full-width window is all ones.
        return std::numeric_limits<uint64_t>::max();
    }
}

Here is the call graph for this function:

◆ parse_env_mebibytes()

uint64_t cusbf::detail::parse_env_mebibytes ( std::string_view value )

inline

Parses a decimal mebibyte count from value.

Accepts a leading decimal prefix (in the same spirit as strtoull). Returns 0 when value is empty, does not start with a digit, or does not fit in uint64_t.

Definition at line 16 of file host_parse.hpp.

                                                                        {
    uint64_t parsed = 0;
    if (!value.empty()) {
        const char* const first = value.data();
        const char* const last = first + value.size();
        const auto outcome = std::from_chars(first, last, parsed);
        // Reject conversion errors and inputs where no digit was consumed.
        const bool accepted = outcome.ec == std::errc{} && outcome.ptr != first;
        if (!accepted) {
            return 0;
        }
    }
    return parsed;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ parse_host_chunk_max_bytes()

size_t cusbf::detail::parse_host_chunk_max_bytes ( const char * env_name )

inline

Definition at line 14 of file fastx_host_limits.cuh.

                                                                             {
    // Reads the mebibyte count from the named environment variable and converts
    // it to bytes. Returns 0 when the parsed value is 0.
    const uint64_t mebibytes = parse_env_mebibytes(getenv_value(env_name));
    if (mebibytes == 0) {
        return 0;
    }

    // Saturate rather than let the shift wrap: an oversized setting would
    // otherwise turn into 0 or an arbitrarily small byte limit.
    constexpr size_t max_bytes = ~size_t{0};
    constexpr size_t max_mebibytes = max_bytes >> 20;
    if (mebibytes > max_mebibytes) {
        return max_bytes;
    }
    return static_cast<size_t>(mebibytes) << 20;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ prepare_dense_packed_tiles()

template<typename Config >

__device__ __forceinline__ bool cusbf::detail::prepare_dense_packed_tiles	(	const uint64_t *	words,
		uint64_t	block_start_kmer,
		uint64_t	block_kmers,
		uint64_t *	word_tile,
		uint8_t *	sequence_tile
	)

Fills sequence_tile with encoded symbols for k-mers starting at block_start_kmer.

Cooperatively loads the covering uint64_t words into word_tile, then decodes symbols from shared memory to avoid repeated global loads across overlapping k-mers.

Returns: true when every symbol in the tile is valid.

Definition at line 90 of file dense_packed.cuh.

  {
    constexpr uint64_t symbols_per_word = dense_packed_symbols_per_word<Config>();
    // The tile spans every symbol touched by block_kmers consecutive k-mers.
    const uint64_t symbol_count = block_kmers + Config::k - 1;
    const uint64_t word_begin = block_start_kmer / symbols_per_word;
    const uint64_t word_last = (block_start_kmer + symbol_count - 1) / symbols_per_word;
    const uint64_t word_count = word_last - word_begin + 1;

    // Stage 1: threads copy the covering words into word_tile, stepping by the block size.
    uint64_t slot = threadIdx.x;
    while (slot < word_count) {
        word_tile[slot] = words[word_begin + slot];
        slot += Config::cudaBlockSize;
    }
    // All staged words must be written before any thread decodes from word_tile.
    __syncthreads();

    // Stage 2: decode symbols out of word_tile and note whether this thread saw an invalid one.
    bool saw_invalid = false;
    for (uint64_t pos = threadIdx.x; pos < symbol_count; pos += Config::cudaBlockSize) {
        const uint8_t symbol =
            dense_packed_symbol_at_local<Config>(word_tile, word_begin, block_start_kmer + pos);
        sequence_tile[pos] = symbol;
        if (symbol == Config::Alphabet::invalidSymbol) {
            saw_invalid = true;
        }
    }
    // Barrier plus block-wide count of threads that saw an invalid symbol.
    return __syncthreads_count(saw_invalid) == 0;
}

Here is the call graph for this function:

◆ prepare_sequence_hash_tiles()

template<typename Config >

__device__ __forceinline__ bool cusbf::detail::prepare_sequence_hash_tiles	(	const char *	sequence,
		uint64_t	block_start_kmer,
		uint64_t	block_kmers,
		uint8_t *	sequence_tile
	)

Encodes a block's sequence slice into sequence_tile and reports global validity.

Returns: true when every encoded base in the tile is valid.

Definition at line 109 of file sequence_kmer.cuh.

  {
    // block_kmers consecutive k-mers touch block_kmers + k - 1 symbols.
    const uint64_t symbol_count = block_kmers + Config::k - 1;

    bool saw_invalid = false;
    uint64_t pos = threadIdx.x;
    while (pos < symbol_count) {
        const char* const raw = sequence + (block_start_kmer + pos) * Config::symbolWidth;
        const uint8_t symbol = Config::Alphabet::encode(raw);
        sequence_tile[pos] = symbol;
        if (symbol == Config::Alphabet::invalidSymbol) {
            saw_invalid = true;
        }
        pos += Config::cudaBlockSize;
    }
    // Barrier plus block-wide count of threads that encoded an invalid symbol.
    return __syncthreads_count(saw_invalid) == 0;
}

Here is the call graph for this function:

◆ propagate_error()

cuda::std::unexpected< Error > cusbf::detail::propagate_error ( const Error & error )

inline

Copies error for propagation (avoids moving out of expected::error()).

Definition at line 212 of file error.hpp.

                                                                                  {
    // Error{error} makes a fresh copy, so the caller's error object is left intact.
    return cuda::std::unexpected<Error>(Error{error});
}

Here is the call graph for this function:

◆ query_available_host_bytes()

size_t cusbf::detail::query_available_host_bytes ( )

inline

Available physical RAM (bytes) for mmap budgeting.

Definition at line 36 of file fastx_host_memory.cuh.

                                                         {
#if defined(__linux__)
    const long bytes_per_page = ::sysconf(_SC_PAGESIZE);
    const long free_pages = ::sysconf(_SC_AVPHYS_PAGES);
    // A non-positive value from either query is treated as "unknown" and reported as 0.
    const bool usable = bytes_per_page > 0 && free_pages > 0;
    return usable ? static_cast<size_t>(bytes_per_page) * static_cast<size_t>(free_pages) : 0;
#else
    // No query is implemented for other platforms.
    return 0;
#endif
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ query_cuda_free_memory()

Result< cuda_free_memory > cusbf::detail::query_cuda_free_memory ( )

inline

Queries current device free memory via cudaMemGetInfo.

Definition at line 33 of file fastx_chunk.cuh.

                                                                       {
    size_t free_bytes = 0;
    size_t total_bytes = 0;  // Required by the API; not reported to the caller.
    if (const cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
        status != cudaSuccess) {
        return Err(Error::io(std::format("cudaMemGetInfo failed: {}", cudaGetErrorString(status))));
    }
    return cuda_free_memory{free_bytes};
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ record_kmer_count()

template<typename Config >

constexpr __host__ __device__ uint64_t cusbf::detail::record_kmer_count ( uint64_t bases )

constexpr noexcept

Definition at line 13 of file record_math.cuh.

                                                                                                {
    const uint64_t symbols = record_symbol_count<Config>(bases);
    // Records shorter than k symbols contain no k-mer.
    if (symbols < Config::k) {
        return 0;
    }
    return symbols - Config::k + 1;
}

Here is the call graph for this function:

◆ record_symbol_count()

template<typename Config >

constexpr __host__ __device__ uint64_t cusbf::detail::record_symbol_count ( uint64_t bases )

constexpr noexcept

Definition at line 8 of file record_math.cuh.

                                                                                                  {
    // Integer division: bases left over after the last whole symbol are dropped.
    return bases / Config::symbolWidth;
}

Here is the call graph for this function:

◆ run_fastx_pipeline()

template<typename Config , typename FastxReaderType , typename Adapter >

Result< typename Adapter::report_type > cusbf::detail::run_fastx_pipeline	(	FastxReaderType &	reader,
		std::string_view	source_name,
		double	fill_fraction,
		cuda::stream_ref	stream,
		fastx_dispatch_path	dispatch_path,
		FastxPipelineState &	state,
		Adapter &&	adapter
	)

Definition at line 33 of file fastx_pipeline_core.cuh.

  {
    // Collects FASTX records from `reader` into a batch and hands the batch to
    // `adapter` using one of three strategies: single chunk, pipelined chunks,
    // or synchronous chunks.
    // NOTE(review): the guard presumably releases resources held in `state` on
    // scope exit — confirm against FastxPipelineReleaseGuard.
    FastxPipelineReleaseGuard release_guard{state};
 
    // A failed free-memory query is re-wrapped as a resource error that keeps the original message.
    const auto gpu_memory = query_cuda_free_memory();
    if (!gpu_memory) {
        return Err(Error::resource(gpu_memory.error().message()));
    }
 
    const size_t staging_budget_bytes =
        fastx_staging_budget_bytes<Config>(fill_fraction, gpu_memory->free_bytes);
    const size_t host_chunk_max_bytes = fastx_host_chunk_max_bytes();
    // Only the mmap reader paths pass the file size to the batch builder; other paths pass 0.
    const uint64_t sequence_reserve_bytes =
        fastx_uses_mmap_reader(dispatch_path) ? fastx_file_bytes(source_name) : 0;
 
    DenseRecordBatchBuilder chunk(sequence_reserve_bytes);
    FastxRecord record;
 
    // Shared collection loop: pull records until none remain, notify the adapter
    // about each one, then run the caller-supplied flush policy.
    // NOTE(review): CUSBF_TRY appears to return early on error and otherwise
    // yield the unwrapped value — confirm against its definition.
    auto collect_all = [&](auto&& maybe_flush) -> Result<void> {
        for (;;) {
            // Index the record will have within the current chunk, taken before it is appended.
            const uint64_t local_index = chunk.recordCount();
            if (!CUSBF_TRY(collect_next_fastx_record(reader, record, chunk))) {
                break;
            }
            adapter.on_record_collected(record, local_index, chunk);
            CUSBF_TRY(maybe_flush());
        }
        return {};
    };
 
    // Single-chunk paths: collect everything with a no-op flush policy, then flush once.
    if (fastx_is_single_chunk_path(dispatch_path)) {
        CUSBF_TRY(collect_all([&]() -> Result<void> { return {}; }));
        CUSBF_TRY(adapter.flush_sync(chunk, stream));
        return adapter.finish();
    }
 
    // Pipelined path: taken only when the caller passed a null stream and the adapter supports it.
    if (stream.get() == nullptr && adapter.supports_pipelined()) {
        const size_t pipelined_chunk_budget =
            fastx_pipelined_chunk_budget(adapter.chunk_mode(), staging_budget_bytes);
        ChunkStreamPair chunk_streams;
        // `ping` and `has_inflight` are passed by name to every flush_pipelined /
        // finish_pipelined call below; the adapter owns their meaning.
        size_t ping = 0;
        bool has_inflight = false;
 
        CUSBF_TRY(collect_all([&]() -> Result<void> {
            if (!fastx_chunk_should_flush<Config>(
                    adapter.chunk_mode(),
                    pipelined_chunk_budget,
                    host_chunk_max_bytes,
                    chunk.raw_sequence_bytes(),
                    chunk.recordCount()
                )) {
                return {};
            }
            return adapter.flush_pipelined(chunk, chunk_streams, ping, has_inflight);
        }));
 
        // Flush whatever is left after the last record, then wait for both streams.
        CUSBF_TRY(adapter.flush_pipelined(chunk, chunk_streams, ping, has_inflight));
        CUSBF_TRY(chunk_streams.sync_all());
        CUSBF_TRY(adapter.finish_pipelined(chunk_streams, ping, has_inflight));
        return adapter.finish();
    }
 
    // Synchronous chunked path. A null stream reaching this point means the
    // pipelined branch above was not taken, so the pipelined budget is used for
    // that case and the full staging budget otherwise.
    const size_t sync_chunk_budget =
        stream.get() == nullptr && !adapter.supports_pipelined()
            ? fastx_pipelined_chunk_budget(adapter.chunk_mode(), staging_budget_bytes)
            : staging_budget_bytes;
 
    CUSBF_TRY(collect_all([&]() -> Result<void> {
        if (!fastx_chunk_should_flush<Config>(
                adapter.chunk_mode(),
                sync_chunk_budget,
                host_chunk_max_bytes,
                chunk.raw_sequence_bytes(),
                chunk.recordCount()
            )) {
            return {};
        }
        return adapter.flush_sync(chunk, stream);
    }));
 
    // Final flush for the records collected since the last in-loop flush.
    CUSBF_TRY(adapter.flush_sync(chunk, stream));
    return adapter.finish();
}

Here is the call graph for this function:

◆ select_fastx_dispatch_path()

template<typename Config >

fastx_dispatch_path cusbf::detail::select_fastx_dispatch_path	(	const std::filesystem::path &	path,
		fastx_chunk_mode	mode,
		double	fill_fraction
	)

inline

Selects mmap vs stream and single- vs multi-chunk processing for a path.

Definition at line 114 of file fastx_dispatch.hpp.

  {
    // Inputs carrying the gzip magic always take the chunked stream path; for
    // everything else the decision is delegated to the size-based selector.
    if (!isGzipFile(path)) {
        return select_fastx_dispatch_path_for_file_bytes<Config>(
            fastx_file_bytes(path), mode, fill_fraction, fastx_file_fits_in_memory(path)
        );
    }
    return fastx_dispatch_path::chunked_stream;
}

Here is the call graph for this function:

◆ select_fastx_dispatch_path_for_file_bytes()

template<typename Config >

fastx_dispatch_path cusbf::detail::select_fastx_dispatch_path_for_file_bytes	(	uint64_t	file_bytes,
		fastx_chunk_mode	mode,
		double	fill_fraction,
		bool	file_fits_in_memory
	)

inline

Selects mmap vs stream and single- vs multi-chunk processing from file size.

Definition at line 89 of file fastx_dispatch.hpp.

  {
    // Evaluate the GPU single-chunk predicate once so that both single-chunk
    // decisions below rest on the same answer. An empty file never qualifies.
    const bool fits_single_gpu_chunk =
        file_bytes > 0 && fastx_fits_single_gpu_chunk<Config>(mode, fill_fraction, file_bytes);

    if (fits_single_gpu_chunk) {
        // Inputs within the stream size cap are read in one streamed chunk.
        if (file_bytes <= fastx_single_chunk_stream_max_bytes()) {
            return fastx_dispatch_path::single_chunk_stream;
        }
        // Larger inputs use the single-chunk mmap path when host memory allows.
        if (file_fits_in_memory) {
            return fastx_dispatch_path::single_chunk_mmap;
        }
    }

    // Multi-chunk processing: mmap when the file fits in host memory, stream otherwise.
    return file_fits_in_memory ? fastx_dispatch_path::chunked_mmap
                               : fastx_dispatch_path::chunked_stream;
}

Here is the call graph for this function:

◆ separatorByteAlwaysEncodesInvalid()

template<typename T >

consteval bool cusbf::detail::separatorByteAlwaysEncodesInvalid ( )

Tests that for every position in the input, placing the separator byte at that position always results in an invalid encoding.

This is a necessary condition for the separator to function correctly when concatenating sequences, as it prevents the creation of valid symbols that span across sequence boundaries.

Template Parameters

T	Alphabet type to test.

Returns: bool True if the separator byte always produces an invalid encoding at every position, false if any position allows the separator to be part of a valid encoding.

Definition at line 67 of file Alphabet.cuh.

                                                   {
    // Check each separator position in turn; stop at the first one that fails.
    bool always_invalid = true;
    for (uint64_t position = 0; always_invalid && position < T::symbolWidth; ++position) {
        char scratch[T::symbolWidth]{};
        always_invalid = separatorPositionAlwaysEncodesInvalid<T>(scratch, position, 0);
    }
    return always_invalid;
}

Here is the call graph for this function:

◆ separatorPositionAlwaysEncodesInvalid()

template<typename T >

consteval bool cusbf::detail::separatorPositionAlwaysEncodesInvalid	(	char *	input,
		uint64_t	separatorPosition,
		uint64_t	index
	)

Recursively tests whether placing the separator byte at any position in an input of valid bytes always results in an invalid encoding.

This ensures that the separator cannot be confused with valid symbols when concatenating sequences.

Template Parameters

T	Alphabet type to test.

Parameters

input	Buffer to construct input strings for encoding. Must have length at least `T::symbolWidth`.
separatorPosition	Position at which to place the separator byte in the input.
index	Current index being set in the input. Should be called with 0 initially.

Returns: bool True if the separator byte always produces an invalid encoding, false if any combination of valid bytes with the separator produces a valid encoding.

Definition at line 37 of file Alphabet.cuh.

                                                                                               {
    // Base case: every byte is filled in, so the candidate must encode to the invalid symbol.
    if (index == T::symbolWidth) {
        return T::encode(input) == static_cast<uint8_t>(T::invalidSymbol);
    }

    const uint64_t next = index + 1;
    if (index != separatorPosition) {
        // Try each valid byte in this slot; one valid encoding disproves the property.
        const uint64_t candidates = validByteCount<T>();
        for (uint64_t choice = 0; choice < candidates; ++choice) {
            input[index] = T::validBytes[choice];
            if (!separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, next)) {
                return false;
            }
        }
        return true;
    }

    // This slot is pinned to the separator byte.
    input[index] = static_cast<char>(T::separator);
    return separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, next);
}

Here is the call graph for this function:

◆ trimTrailingCarriageReturn()

void cusbf::detail::trimTrailingCarriageReturn ( std::string & line )

inline

Removes a trailing carriage return from line if present (Windows line endings).

Definition at line 261 of file Fastx.hpp.

                                                        {
    // ends_with is false for an empty string, so no separate emptiness check is needed.
    if (line.ends_with('\r')) {
        line.pop_back();
    }
}

◆ try_unwrap_success() [1/2]

template<typename T >

T cusbf::detail::try_unwrap_success ( Result< T > & result )

Definition at line 203 of file error.hpp.

                                                      {
    // Moves the contained value out of `result`; it is dereferenced without a check.
    // NOTE(review): presumably callers only reach this after confirming success — verify.
    return std::move(*result);
}

Here is the call graph for this function:

◆ try_unwrap_success() [2/2]

void cusbf::detail::try_unwrap_success ( Result< void > & result )

inline

Definition at line 207 of file error.hpp.

                                                     {
    // Nothing to unwrap for Result<void>; the cast only marks the parameter as used.
    (void)result;
}

Here is the call graph for this function:

◆ validate_fastx_staging_fits()

template<typename Config >

Result< void > cusbf::detail::validate_fastx_staging_fits	(	fastx_chunk_mode	mode,
		double	fill_fraction,
		uint64_t	raw_bytes,
		uint64_t	record_count,
		std::string_view	source_name
	)

inline

Returns: Resource error if raw_bytes / record_count exceed the GPU staging budget.

Definition at line 179 of file fastx_chunk.cuh.

  {
    // A failed free-memory query is forwarded to the caller unchanged.
    const auto gpu_memory = query_cuda_free_memory();
    if (!gpu_memory) {
        return Err(gpu_memory.error());
    }

    const size_t budget =
        fastx_staging_budget_bytes<Config>(fill_fraction, gpu_memory->free_bytes);
    const bool over_budget =
        fastx_chunk_reached_staging_budget<Config>(mode, budget, raw_bytes, record_count);
    if (over_budget) {
        return Err(Error::resource(std::format(
            "{}: FASTX input requires more GPU memory than available at fill_fraction={} "
            "(free staging budget {} bytes)",
            source_name,
            fill_fraction,
            budget
        )));
    }
    return {};
}

Here is the call graph for this function:

◆ validByteCount()

template<typename T >

consteval uint64_t cusbf::detail::validByteCount ( )

Definition at line 14 of file Alphabet.cuh.

                                    {
    // Length of T::validBytes up to, but not including, its NUL terminator.
    uint64_t length = 0;
    for (; T::validBytes[length] != '\0'; ++length) {
    }
    return length;
}

◆ warpReduceOr()

__device__ __forceinline__ uint64_t cusbf::detail::warpReduceOr	(	uint32_t	peers,
		uint64_t	value
	)

OR-reduce a uint64_t across the lanes in a peer mask.

On sm_80+ this uses __reduce_or_sync; on older architectures it falls back to a shuffle-based reduction.

Definition at line 84 of file helpers.cuh.

                                                                                 {
#if __CUDA_ARCH__ >= 800
    // __reduce_or_sync works on 32-bit operands, so the two halves are reduced
    // separately and then recombined.
    auto lo = __reduce_or_sync(peers, static_cast<uint32_t>(value));
    auto hi = __reduce_or_sync(peers, static_cast<uint32_t>(value >> 32));
    return (static_cast<uint64_t>(hi) << 32) | lo;
#else
    // Shuffle-based reduction across the lanes set in `peers`.
    // `remaining` starts as `peers` for every caller, so each caller runs the
    // same number of iterations, one per set bit.
    // NOTE(review): assumes every lane named in `peers` calls this with the same mask — confirm.
    uint32_t remaining = peers;
    while (remaining) {
        // Lane index of the lowest set bit still to be visited.
        int src = __ffs(remaining) - 1;
        // Fetch lane src's current value as two 32-bit halves and reassemble it.
        uint64_t other =
            (static_cast<uint64_t>(__shfl_sync(peers, static_cast<uint32_t>(value >> 32), src))
             << 32) |
            __shfl_sync(peers, static_cast<uint32_t>(value), src);
        // OR is idempotent, so reading a lane whose value already holds partial results is harmless.
        value |= other;
        remaining &= remaining - 1;  // clear lowest set bit
    }
    return value;
#endif
}

Here is the call graph for this function:

Variable Documentation

◆ kContainsSequenceStride

constexpr uint32_t cusbf::detail::kContainsSequenceStride = 4

inline constexpr

K-mers processed per query thread per inner loop iteration.

Definition at line 22 of file filter_common.cuh.

◆ kDefaultFastxHostRamSlackBytes

constexpr size_t cusbf::detail::kDefaultFastxHostRamSlackBytes = 4u << 30

inline constexpr

Default headroom left for the OS and other processes when sizing mmap.

Definition at line 20 of file fastx_host_memory.cuh.

◆ kInvalidHash

constexpr uint64_t cusbf::detail::kInvalidHash = std::numeric_limits<uint64_t>::max()

inline constexpr

Sentinel hash value indicating "no valid minimizer found".

Definition at line 25 of file filter_common.cuh.

Namespaces

Classes

Concepts

Typedefs

Enumerations

Functions

Variables

Typedef Documentation

◆ fastx_dispatch_handler_result_t

Enumeration Type Documentation

◆ fastx_chunk_mode

◆ fastx_dispatch_path

◆ FastxFormat

Function Documentation

◆ advance_packed_kmer()

◆ atomicOrWord()

◆ build_stride_kmer_valid_mask()

◆ collect_next_fastx_record()

◆ contains_dense_packed_kmers_kernel()

◆ contains_kmers_from_symbol_tile()

◆ contains_sequence_kmers_kernel()

◆ count_positive_kmers_per_record()

◆ count_positive_kmers_per_record_kernel()

◆ count_positive_kmers_total()

◆ dense_packed_insert_word_tile_capacity()

◆ dense_packed_kmer_count()

◆ dense_packed_query_word_tile_capacity()

◆ dense_packed_symbol_at()

◆ dense_packed_symbol_at_local()

◆ dense_packed_symbols_per_word()

◆ dense_packed_word_count()

◆ dispatch_fastx_file()

◆ estimate_insert_staging_bytes()

◆ estimate_normalized_sequence_bytes()

◆ estimate_query_staging_bytes()

◆ extractPackedSubwindow()

◆ fastx_chunk_reached_host_byte_limit()

◆ fastx_chunk_reached_staging_budget()

◆ fastx_chunk_should_flush()

◆ fastx_chunk_slack_bytes()

◆ fastx_column_at()

◆ fastx_file_bytes()

◆ fastx_file_fits_gpu_staging()

◆ fastx_file_fits_in_memory()

◆ fastx_file_supports_memory_map()

◆ fastx_fits_single_gpu_chunk()

◆ fastx_host_chunk_max_bytes()

◆ fastx_host_ram_slack_bytes()

◆ fastx_is_single_chunk_path()

◆ fastx_memory_map_max_bytes()

◆ fastx_pipelined_chunk_budget()

◆ fastx_quality_excess_column()

◆ fastx_quality_short_column()

◆ fastx_record_kmer_count()

◆ fastx_record_symbol_count()

◆ fastx_single_chunk_stream_max_bytes()

◆ fastx_staging_budget_bytes()

◆ fastx_uses_mmap_reader()

◆ forEachHashIndex()

◆ forEachHashIndexImpl()

◆ getenv_value()

◆ hash64()

◆ insert_dense_packed_kmers_kernel()

◆ insert_kmers_from_symbol_tile()

◆ insert_sequence_kmers_kernel()

◆ isGzipFile()

◆ kmer_is_valid()

◆ load128BitGlobalNC()

◆ load256BitGlobalNC() [1/2]

◆ load256BitGlobalNC() [2/2]

◆ load_shard_words4()

◆ maxOccupancyGridSize()

◆ minimizer_hash64()

◆ multiplicativeSaltLiteral()

◆ openFastxFile()

◆ pack_dense_sequence_kernel()

◆ pack_kmer_from_tile()

◆ packed_kmer_minimizer_hash()

◆ packed_kmer_smer_hash()

◆ packedWindowMask()