cuSBF
Loading...
Searching...
No Matches
include
cusbf
helpers.cuh
Go to the documentation of this file.
1
#pragma once
2
3
#include <cuda_runtime.h>
4
5
#include <cuda/std/bit>
6
#include <cuda/std/concepts>
7
8
#include <cstddef>
9
#include <cstdint>
10
#include <stdexcept>
11
#include <string>
12
13
namespace
cusbf
{
14
18
class
CudaError
:
public
std::runtime_error {
19
public
:
20
CudaError
(cudaError_t
code
,
const
char
* file,
int
line)
21
: std::runtime_error(
22
std::string(file) +
":"
+ std::to_string(line) +
" "
+ cudaGetErrorString(
code
)
23
),
24
code_(
code
) {
25
}
26
27
[[nodiscard]] cudaError_t
code
() const noexcept {
28
return
code_;
29
}
30
31
private
:
32
cudaError_t code_;
33
};
34
35
}
// namespace cusbf
36
37
namespace
cusbf::detail
{
38
39
#if __CUDA_ARCH__ >= 1000
40
55
template
<
typename
T>
56
__device__
__forceinline__
void
load256BitGlobalNC(
const
T
*
ptr
,
T
*
out
) {
57
static_assert
(
sizeof
(
T
) == 4 ||
sizeof
(
T
) == 8,
"T must be uint32_t or uint64_t"
);
58
59
if
constexpr
(
sizeof
(
T
) == 8) {
60
asm
volatile
(
"ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
61
:
"=l"
(
out
[0]),
"=l"
(
out
[1]),
"=l"
(
out
[2]),
"=l"
(
out
[3])
62
:
"l"
(
ptr
));
63
}
else
{
64
asm
volatile
(
"ld.global.nc.v8.u32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
65
:
"=r"
(
out
[0]),
66
"=r"
(
out
[1]),
67
"=r"
(
out
[2]),
68
"=r"
(
out
[3]),
69
"=r"
(
out
[4]),
70
"=r"
(
out
[5]),
71
"=r"
(
out
[6]),
72
"=r"
(
out
[7])
73
:
"l"
(
ptr
));
74
}
75
}
76
77
__device__
__forceinline__
void
load256BitGlobalNC(
78
const
uint64_t
*
ptr
,
79
uint64_t
&
out0
,
80
uint64_t
&
out1
,
81
uint64_t
&
out2
,
82
uint64_t
&
out3
83
) {
84
asm
volatile
(
"ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
85
:
"=l"
(
out0
),
"=l"
(
out1
),
"=l"
(
out2
),
"=l"
(
out3
)
86
:
"l"
(
ptr
));
87
}
88
89
#endif
90
96
__device__
__forceinline__
void
97
load128BitGlobalNC(
const
uint64_t
*
ptr
,
uint64_t
&
out0
,
uint64_t
&
out1
) {
98
asm
volatile
(
"ld.global.nc.v2.u64 {%0, %1}, [%2];"
:
"=l"
(
out0
),
"=l"
(
out1
) :
"l"
(
ptr
));
99
}
100
107
__device__
__forceinline__
uint64_t
warpReduceOr(
uint32_t
peers
,
uint64_t
value) {
108
#if __CUDA_ARCH__ >= 800
109
auto
lo
=
__reduce_or_sync
(
peers
,
static_cast<
uint32_t
>
(value));
110
auto
hi
=
__reduce_or_sync
(
peers
,
static_cast<
uint32_t
>
(value >> 32));
111
return
(
static_cast<
uint64_t
>
(
hi
) << 32) |
lo
;
112
#else
113
// Shuffle-based reduction across the lanes set in `peers`.
114
uint32_t
remaining
=
peers
;
115
while
(
remaining
) {
116
int
src
=
__ffs
(
remaining
) - 1;
117
uint64_t
other
=
118
(
static_cast<
uint64_t
>
(
__shfl_sync
(
peers
,
static_cast<
uint32_t
>
(value >> 32),
src
))
119
<< 32) |
120
__shfl_sync
(
peers
,
static_cast<
uint32_t
>
(value),
src
);
121
value |=
other
;
122
remaining
&=
remaining
- 1;
// clear lowest set bit
123
}
124
return
value;
125
#endif
126
}
127
132
#define CUSBF_CUDA_CALL(err) \
133
do { \
134
cudaError_t err_ = (err); \
135
if (err_ == cudaSuccess) [[likely]] { \
136
break; \
137
} \
138
throw cusbf::CudaError(err_, __FILE__, __LINE__); \
139
} while (0)
140
150
template
<
typename
Kernel>
151
uint64_t
maxOccupancyGridSize(
int32_t
blockSize
,
Kernel
kernel
,
uint64_t
dynamicSMemSize
) {
152
int
device
= 0;
153
cudaGetDevice
(&
device
);
154
155
int
numSM
= -1;
156
cudaDeviceGetAttribute
(&
numSM
,
cudaDevAttrMultiProcessorCount
,
device
);
157
158
int
maxActiveBlocksPerSM
{};
159
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
160
&
maxActiveBlocksPerSM
,
kernel
,
blockSize
,
dynamicSMemSize
161
);
162
163
return
maxActiveBlocksPerSM
*
numSM
;
164
}
165
166
}
// namespace cusbf::detail
cusbf::CudaError
Exception thrown on CUDA runtime errors.
Definition
helpers.cuh:18
cusbf::CudaError::CudaError
CudaError(cudaError_t code, const char *file, int line)
Definition
helpers.cuh:20
cusbf::CudaError::code
cudaError_t code() const noexcept
Definition
helpers.cuh:27
cusbf::detail
Definition
Alphabet.cuh:11
cusbf::detail::separatorPositionAlwaysEncodesInvalid
consteval bool separatorPositionAlwaysEncodesInvalid(char *input, uint64_t separatorPosition, uint64_t index)
Recursively tests whether placing the separator byte at any position in an input of valid bytes alway...
Definition
Alphabet.cuh:37
cusbf
Definition
Alphabet.cuh:9
Generated by
1.9.8