cuSBF
Loading...
Searching...
No Matches
helpers.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <cuda_runtime.h>
4
5#include <cuda/std/bit>
6#include <cuda/std/concepts>
7
8#include <cstddef>
9#include <cstdint>
10#include <stdexcept>
11#include <string>
12
13namespace cusbf {
14
18class CudaError : public std::runtime_error {
19 public:
20 CudaError(cudaError_t code, const char* file, int line)
21 : std::runtime_error(
22 std::string(file) + ":" + std::to_string(line) + " " + cudaGetErrorString(code)
23 ),
24 code_(code) {
25 }
26
27 [[nodiscard]] cudaError_t code() const noexcept {
28 return code_;
29 }
30
31 private:
32 cudaError_t code_;
33};
34
35} // namespace cusbf
36
37namespace cusbf::detail {
38
39#if __CUDA_ARCH__ >= 1000
40
55template <typename T>
56__device__ __forceinline__ void load256BitGlobalNC(const T* ptr, T* out) {
57 static_assert(sizeof(T) == 4 || sizeof(T) == 8, "T must be uint32_t or uint64_t");
58
59 if constexpr (sizeof(T) == 8) {
60 asm volatile("ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
61 : "=l"(out[0]), "=l"(out[1]), "=l"(out[2]), "=l"(out[3])
62 : "l"(ptr));
63 } else {
64 asm volatile("ld.global.nc.v8.u32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
65 : "=r"(out[0]),
66 "=r"(out[1]),
67 "=r"(out[2]),
68 "=r"(out[3]),
69 "=r"(out[4]),
70 "=r"(out[5]),
71 "=r"(out[6]),
72 "=r"(out[7])
73 : "l"(ptr));
74 }
75}
76
77__device__ __forceinline__ void load256BitGlobalNC(
78 const uint64_t* ptr,
83) {
84 asm volatile("ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
85 : "=l"(out0), "=l"(out1), "=l"(out2), "=l"(out3)
86 : "l"(ptr));
87}
88
89#endif
90
97load128BitGlobalNC(const uint64_t* ptr, uint64_t& out0, uint64_t& out1) {
98 asm volatile("ld.global.nc.v2.u64 {%0, %1}, [%2];" : "=l"(out0), "=l"(out1) : "l"(ptr));
99}
100
108#if __CUDA_ARCH__ >= 800
109 auto lo = __reduce_or_sync(peers, static_cast<uint32_t>(value));
110 auto hi = __reduce_or_sync(peers, static_cast<uint32_t>(value >> 32));
111 return (static_cast<uint64_t>(hi) << 32) | lo;
112#else
113 // Shuffle-based reduction across the lanes set in `peers`.
115 while (remaining) {
116 int src = __ffs(remaining) - 1;
118 (static_cast<uint64_t>(__shfl_sync(peers, static_cast<uint32_t>(value >> 32), src))
119 << 32) |
120 __shfl_sync(peers, static_cast<uint32_t>(value), src);
121 value |= other;
122 remaining &= remaining - 1; // clear lowest set bit
123 }
124 return value;
125#endif
126}
127
132#define CUSBF_CUDA_CALL(err) \
133 do { \
134 cudaError_t err_ = (err); \
135 if (err_ == cudaSuccess) [[likely]] { \
136 break; \
137 } \
138 throw cusbf::CudaError(err_, __FILE__, __LINE__); \
139 } while (0)
140
150template <typename Kernel>
152 int device = 0;
154
155 int numSM = -1;
157
161 );
162
164}
165
166} // namespace cusbf::detail
Exception thrown on CUDA runtime errors.
Definition helpers.cuh:18
CudaError(cudaError_t code, const char *file, int line)
Definition helpers.cuh:20
cudaError_t code() const noexcept
Definition helpers.cuh:27
consteval bool separatorPositionAlwaysEncodesInvalid(char *input, uint64_t separatorPosition, uint64_t index)
Recursively tests whether placing the separator byte at any position in an input of valid bytes alway...
Definition Alphabet.cuh:37