cuSBF
Loading...
Searching...
No Matches
Alphabet.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <cuda/std/__bit/integral.h>
4#include <cuda_runtime.h>
5
6#include <concepts>
7#include <cstdint>
8
9namespace cusbf {
10
11namespace detail {
12
/**
 * Counts the bytes in `T::validBytes` that precede the first NUL terminator.
 *
 * @tparam T Type exposing a NUL-terminated, subscriptable `validBytes` member.
 * @return Number of bytes before the terminating `'\0'`.
 */
template <typename T>
consteval uint64_t validByteCount() {
    uint64_t length = 0;
    // Advance until the terminator is reached; the loop body is intentionally empty.
    for (; T::validBytes[length] != '\0'; ++length) {
    }
    return length;
}
21
/**
 * Recursively fills `input[index .. T::symbolWidth)` and checks that every resulting
 * symbol encodes to `T::invalidSymbol` when `T::separator` sits at `separatorPosition`.
 *
 * Every position other than `separatorPosition` is tried with each byte of `T::validBytes`.
 *
 * @tparam T Alphabet-like type providing `symbolWidth`, `separator`, `validBytes`,
 *           `invalidSymbol` and `encode`.
 * @param input Scratch buffer of at least `T::symbolWidth` bytes; overwritten from `index` on.
 * @param separatorPosition Index that receives the separator byte.
 * @param index Next position to fill.
 * @return `true` if every tested combination encodes to the invalid symbol.
 *
 * NOTE(review): listing lines 44, 47 and 49 (the two recursive calls and the loop header)
 * were absent from the extracted listing and are reconstructed here from the surrounding
 * braces and the use of `byteIndex` - confirm against the original header.
 */
template <typename T>
consteval bool
separatorPositionAlwaysEncodesInvalid(char* input, uint64_t separatorPosition, uint64_t index) {
    // Buffer is complete: the symbol must encode to the invalid marker.
    if (index == T::symbolWidth) {
        return T::encode(input) == static_cast<uint8_t>(T::invalidSymbol);
    }

    // The separator slot has exactly one candidate byte.
    if (index == separatorPosition) {
        input[index] = static_cast<char>(T::separator);
        return separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, index + 1);
    }

    // Every other slot is exercised with each valid byte in turn.
    for (uint64_t byteIndex = 0; byteIndex < validByteCount<T>(); ++byteIndex) {
        input[index] = T::validBytes[byteIndex];
        if (!separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, index + 1)) {
            return false;
        }
    }
    return true;
}
55
/**
 * Checks, for every separator position within one symbol, that a symbol containing the
 * separator byte at that position encodes to the invalid symbol.
 *
 * @tparam T Alphabet-like type providing `symbolWidth` plus the members used by
 *           `separatorPositionAlwaysEncodesInvalid`.
 * @return `true` if no separator position yields a valid encoding.
 *
 * NOTE(review): listing lines 67 (the signature) and 70 (the `if` header) were absent
 * from the extracted listing. The signature follows the generated documentation entry
 * for this function; the call is reconstructed from the helper's parameter list -
 * confirm against the original header.
 */
template <typename T>
consteval bool separatorByteAlwaysEncodesInvalid() {
    for (uint64_t separatorPosition = 0; separatorPosition < T::symbolWidth; ++separatorPosition) {
        // Fresh zero-initialised scratch buffer for each separator position.
        char input[T::symbolWidth]{};
        if (!separatorPositionAlwaysEncodesInvalid<T>(input, separatorPosition, 0)) {
            return false;
        }
    }
    return true;
}
76
77} // namespace detail
78
// Concept satisfied by types that describe how input bytes map to symbol indices.
//
// A conforming type T must provide:
//   - T::symbolCount and T::symbolWidth, convertible to uint64_t;
//   - T::invalidSymbol and T::separator, convertible to uint8_t;
//   - T::validBytes, convertible to const char*;
//   - T::encode(const char*), returning exactly uint8_t.
// It must also satisfy the compile-time value checks in the second requires-expression.
94template <typename T>
95concept Alphabet = requires(const char* input) {
96 { T::symbolCount } -> std::convertible_to<uint64_t>;
97 { T::symbolWidth } -> std::convertible_to<uint64_t>;
98 { T::invalidSymbol } -> std::convertible_to<uint8_t>;
99 { T::separator } -> std::convertible_to<uint8_t>;
100 { T::validBytes } -> std::convertible_to<const char*>;
101 { T::encode(input) } -> std::same_as<uint8_t>;
102} && requires {
// symbolCount must lie in [1, 255].
// NOTE(review): presumably this keeps every symbol index distinct from a 0xFF invalid marker - confirm.
103 requires T::symbolCount > 0 && T::symbolCount <= 255;
// A symbol must span at least one input byte.
104 requires T::symbolWidth > 0;
// validBytes must contain at least one byte before its NUL terminator.
105 requires detail::validByteCount<T>() > 0;
// NOTE(review): listing line 106 is absent from this extract; a further requirement
// may belong here - confirm against the original header.
107};
108
// Number of input bytes encode() reads per symbol.
114 static constexpr uint64_t symbolWidth = 1;
// Number of distinct valid symbol indices (one per base).
115 static constexpr uint64_t symbolCount = 4;
// Value encode() returns for any byte that is not a recognised base.
116 static constexpr uint8_t invalidSymbol = 0xFFu;
// Byte designated as the separator; it is not a valid base, so encode() maps it to invalidSymbol.
117 static constexpr uint8_t separator = 'N';
// Bytes accepted as valid input; encode() also accepts their lowercase forms.
118 static constexpr char validBytes[] = "ACGT";
119
/**
 * Encodes one nucleotide byte as a 2-bit symbol index without branching.
 *
 * @param input Pointer to at least one readable byte; only `input[0]` is read.
 * @return 0 for A, 1 for C, 2 for T, 3 for G (either case), or `invalidSymbol`
 *         for any other byte.
 */
[[nodiscard]] constexpr __host__ __device__ __forceinline__ static uint8_t encode(
    const char* input
) {
    const auto raw = static_cast<uint8_t>(input[0]);
    // Bits 1..2 of the ASCII code already separate the four bases: A=0, C=1, T=2, G=3.
    const auto code = static_cast<uint8_t>((raw >> 1u) & 3u);
    // Clearing bit 5 folds lowercase onto uppercase; used for validation only.
    const auto folded = static_cast<uint8_t>(raw & 0xDFu);
    const bool isBase = (folded == 'A') | (folded == 'C') | (folded == 'G') | (folded == 'T');
    // 0xFF when the byte is a base, 0x00 otherwise.
    const auto keep = static_cast<uint8_t>(-static_cast<int>(isBase));
    const auto drop = static_cast<uint8_t>(~keep);
    return static_cast<uint8_t>((code & keep) | (invalidSymbol & drop));
}
130};
131
// Number of input bytes encode() reads per symbol (one triplet).
139 static constexpr uint64_t symbolWidth = 3;
// Number of distinct valid symbol indices: three 2-bit base codes packed into 6 bits.
140 static constexpr uint64_t symbolCount = 64;
// Value encode() returns when any of the three bytes is not a recognised base.
141 static constexpr uint8_t invalidSymbol = 0xFFu;
// Byte designated as the separator; it is not a valid base.
142 static constexpr uint8_t separator = 'N';
// Bytes accepted as valid input at each of the three positions.
143 static constexpr char validBytes[] = "ACGT";
144
/**
 * Encodes three consecutive nucleotide bytes as one 6-bit symbol index without branching.
 *
 * @param input Pointer to at least three readable bytes; `input[0..2]` are read.
 * @return `(first << 4) | (second << 2) | third` built from the per-base codes of
 *         `DnaAlphabet::encode`, or `invalidSymbol` if any of the three bytes is invalid.
 */
[[nodiscard]] constexpr __host__ __device__ __forceinline__ static uint8_t encode(
    const char* input
) {
    const uint8_t first = DnaAlphabet::encode(&input[0]);
    const uint8_t second = DnaAlphabet::encode(&input[1]);
    const uint8_t third = DnaAlphabet::encode(&input[2]);
    // The pack is computed unconditionally; it is discarded below when a base is invalid.
    const auto triplet = static_cast<uint8_t>((first << 4u) | (second << 2u) | third);
    const bool allBases =
        (first != invalidSymbol) & (second != invalidSymbol) & (third != invalidSymbol);
    // 0xFF when all three bytes are bases, 0x00 otherwise.
    const auto keep = static_cast<uint8_t>(-static_cast<int>(allBases));
    const auto drop = static_cast<uint8_t>(~keep);
    return static_cast<uint8_t>((triplet & keep) | (invalidSymbol & drop));
}
156};
157
// Number of input bytes encode() reads per symbol.
168 static constexpr uint64_t symbolWidth = 1;
// Number of distinct valid symbol indices (one per letter A-Z).
169 static constexpr uint64_t symbolCount = 26;
// Value encode() returns for any byte that is not a letter.
170 static constexpr uint8_t invalidSymbol = 0xFFu;
// Byte designated as the separator; it is not a letter, so encode() maps it to invalidSymbol.
171 static constexpr uint8_t separator = '*';
// Bytes accepted as valid input; encode() also accepts their lowercase forms.
172 static constexpr char validBytes[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
173
/**
 * Encodes one letter byte as its zero-based alphabet position without branching.
 *
 * @param input Pointer to at least one readable byte; only `input[0]` is read.
 * @return 0..25 for A..Z (either case), or `invalidSymbol` for any other byte.
 */
[[nodiscard]] constexpr __host__ __device__ __forceinline__ static uint8_t encode(
    const char* input
) {
    const auto raw = static_cast<uint8_t>(input[0]);
    // Fold lowercase onto uppercase, then measure the distance from 'A'. Bytes below 'A'
    // wrap around to large values and fail the range check.
    const auto offset = static_cast<uint8_t>((raw & 0xDFu) - 'A');
    const bool isLetter = offset < 26;
    // 0xFF when the byte is a letter, 0x00 otherwise.
    const auto keep = static_cast<uint8_t>(-static_cast<int>(isLetter));
    const auto drop = static_cast<uint8_t>(~keep);
    return static_cast<uint8_t>((offset & keep) | (invalidSymbol & drop));
}
184};
185
186} // namespace cusbf
Concept for alphabet-like types used to encode bytes as symbol indices.
Definition Alphabet.cuh:95
consteval uint64_t validByteCount()
Definition Alphabet.cuh:14
consteval bool separatorByteAlwaysEncodesInvalid()
Tests that for every position in the input, placing the separator byte at that position always results in the invalid symbol.
Definition Alphabet.cuh:67
consteval bool separatorPositionAlwaysEncodesInvalid(char *input, uint64_t separatorPosition, uint64_t index)
Recursively tests whether placing the separator byte at any position in an input of valid bytes always encodes to the invalid symbol.
Definition Alphabet.cuh:37
An alphabet for encoding DNA sequences, consisting of the symbols A, C, G, and T.
Definition Alphabet.cuh:113
static constexpr uint8_t separator
Definition Alphabet.cuh:117
constexpr __host__ __device__ static __forceinline__ uint8_t encode(const char *input)
Definition Alphabet.cuh:120
static constexpr uint64_t symbolWidth
Definition Alphabet.cuh:114
static constexpr uint8_t invalidSymbol
Definition Alphabet.cuh:116
static constexpr uint64_t symbolCount
Definition Alphabet.cuh:115
static constexpr char validBytes[]
Definition Alphabet.cuh:118
An alphabet that encodes non-overlapping DNA triplets as single symbols.
Definition Alphabet.cuh:138
static constexpr uint8_t separator
Definition Alphabet.cuh:142
static constexpr uint64_t symbolCount
Definition Alphabet.cuh:140
static constexpr uint8_t invalidSymbol
Definition Alphabet.cuh:141
static constexpr char validBytes[]
Definition Alphabet.cuh:143
static constexpr uint64_t symbolWidth
Definition Alphabet.cuh:139
constexpr __host__ __device__ static __forceinline__ uint8_t encode(const char *input)
Definition Alphabet.cuh:145
An alphabet for encoding protein sequences, consisting of the 20 standard amino acids plus common ambiguity codes.
Definition Alphabet.cuh:167
static constexpr uint64_t symbolWidth
Definition Alphabet.cuh:168
constexpr __host__ __device__ static __forceinline__ uint8_t encode(const char *input)
Definition Alphabet.cuh:174
static constexpr uint8_t separator
Definition Alphabet.cuh:171
static constexpr uint8_t invalidSymbol
Definition Alphabet.cuh:170
static constexpr char validBytes[]
Definition Alphabet.cuh:172
static constexpr uint64_t symbolCount
Definition Alphabet.cuh:169