1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
|
#include "String.hpp"
/*
#include <doctest/doctest.h>
*/
/// Builds a code-point iterator on top of a raw byte iterator into UTF-8 text.
/// The iterator is expected to point at the first byte of an encoded code point
/// (or at the end of the underlying string).
Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
    : mIter(std::move(it))
{
}
// Single-bit masks used to inspect the high bits of a UTF-8 byte.
// "First" is the most significant bit, "fifth" the fifth-most significant one.
constexpr unsigned char kFirstBitMask = 0x80;  // 1000'0000
constexpr unsigned char kSecondBitMask = 0x40; // 0100'0000
constexpr unsigned char kThirdBitMask = 0x20;  // 0010'0000
constexpr unsigned char kFourthBitMask = 0x10; // 0001'0000
constexpr unsigned char kFifthBitMask = 0x08;  // 0000'1000
/// Pre-increment: moves to the first byte of the next code point.
///
/// The length of the current sequence is derived solely from the high bits of
/// the byte the iterator currently points at; continuation bytes are not
/// inspected or validated.
Utf8Iterator& Utf8Iterator::operator++() {
    const char lead = *mIter;

    std::string::difference_type step;
    if ((lead & kFirstBitMask) == 0) {
        // 0xxxxxxx: single-byte (ASCII) code point.
        step = 1;
    } else if ((lead & kThirdBitMask) == 0) {
        // High bit set, third bit clear: treated as a two-byte sequence.
        step = 2;
    } else if ((lead & kFourthBitMask) == 0) {
        // 1x10xxxx: treated as a three-byte sequence.
        step = 3;
    } else {
        // 1x11xxxx: treated as a four-byte sequence.
        step = 4;
    }

    mIter += step;
    // The cached code point no longer matches the new position.
    mDirty = true;
    return *this;
}
/// Post-increment: advances to the next code point and returns a copy of the
/// iterator as it was before advancing.
Utf8Iterator Utf8Iterator::operator++(int) {
    Utf8Iterator before{ *this };
    operator++();
    return before;
}
/// Pre-decrement: moves back to the first byte of the previous code point.
///
/// Steps back one byte; if that byte is not ASCII it is taken to be the last
/// byte of a multi-byte sequence, and the iterator keeps stepping back (at most
/// three more bytes in total) until it reaches a byte whose second-highest bit
/// is set, i.e. the lead byte.
Utf8Iterator& Utf8Iterator::operator--() {
    --mIter;

    const bool insideMultiByte = (*mIter & kFirstBitMask) != 0;
    if (insideMultiByte) {
        // A multi-byte sequence has at least two bytes, so this step is unconditional.
        --mIter;
        // Up to two more steps while we are still on a byte with the second bit clear.
        for (int extra = 0; extra < 2 && (*mIter & kSecondBitMask) == 0; ++extra) {
            --mIter;
        }
    }

    // The cached code point no longer matches the new position.
    mDirty = true;
    return *this;
}
/// Post-decrement: steps back to the previous code point and returns a copy of
/// the iterator as it was before stepping back.
Utf8Iterator Utf8Iterator::operator--(int) {
    Utf8Iterator before{ *this };
    operator--();
    return before;
}
/// Returns the code point at the current position.
/// Calls UpdateCurrentValue() first so the stored value reflects the current position.
char32_t Utf8Iterator::operator*() const {
    this->UpdateCurrentValue();
    return this->mCurrentCodePoint;
}
std::string_view::iterator Utf8Iterator::AsInternal() const {
// updateCurrentValue();
return mIter;
}
/// Two iterators are equal when they refer to the same byte position.
/// The cached code point and dirty flag take no part in the comparison.
bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
    const bool samePosition = (lhs.mIter == rhs.mIter);
    return samePosition;
}
/// Two iterators differ when they refer to different byte positions.
bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
    const bool differentPosition = (lhs.mIter != rhs.mIter);
    return differentPosition;
}
/// Compares the iterator's byte position against a raw string_view iterator.
bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
    const bool samePosition = (lhs.mIter == rhs);
    return samePosition;
}
/// True when the iterator's byte position differs from the raw string_view iterator.
bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
    const bool differentPosition = (lhs.mIter != rhs);
    return differentPosition;
}
void Utf8Iterator::UpdateCurrentValue() const {
if (!mDirty) {
return;
}
mCurrentCodePoint = 0;
char firstByte = *mIter;
// This means the first byte has a value greater than 127, and so is beyond the ASCII range.
if (firstByte & kFirstBitMask) {
// This means that the first byte has a value greater than 191, and so it must be at least a three-octet code point.
if (firstByte & kThirdBitMask) {
// This means that the first byte has a value greater than 224, and so it must be a four-octet code point.
if (firstByte & kFourthBitMask) {
mCurrentCodePoint = (firstByte & 0x07) << 18;
char secondByte = *(mIter + 1);
mCurrentCodePoint += (secondByte & 0x3f) << 12;
char thirdByte = *(mIter + 2);
mCurrentCodePoint += (thirdByte & 0x3f) << 6;
char fourthByte = *(mIter + 3);
mCurrentCodePoint += (fourthByte & 0x3f);
} else {
mCurrentCodePoint = (firstByte & 0x0f) << 12;
char secondByte = *(mIter + 1);
mCurrentCodePoint += (secondByte & 0x3f) << 6;
char thirdByte = *(mIter + 2);
mCurrentCodePoint += (thirdByte & 0x3f);
}
} else {
mCurrentCodePoint = (firstByte & 0x1f) << 6;
char secondByte = *(mIter + 1);
mCurrentCodePoint += (secondByte & 0x3f);
}
} else {
mCurrentCodePoint = firstByte;
}
mDirty = true;
}
/// Wraps a view over UTF-8 text so it can be iterated code point by code point.
/// Only the view is stored.
Utf8IterableString::Utf8IterableString(std::string_view str)
    : mStr(str)
{
}
/// Iterator positioned at the first byte of the wrapped view.
Utf8Iterator Utf8IterableString::begin() const {
    Utf8Iterator first(mStr.begin());
    return first;
}
/// Iterator positioned one past the last byte of the wrapped view.
Utf8Iterator Utf8IterableString::end() const {
    Utf8Iterator pastLast(mStr.end());
    return pastLast;
}
/*
TEST_CASE("Iterating ASCII string") {
std::string ascii("This is an ASCII string");
std::u32string output;
output.reserve(ascii.length());
for (char32_t c : Utf8IterableString(ascii)) {
output += c;
}
CHECK(output == U"This is an ASCII string");
}
// BMP: Basic Multilingual Plane
TEST_CASE("Iterating BMP string") {
std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
std::u32string output;
output.reserve(10);
for (char32_t c : Utf8IterableString(unicode)) {
output += c;
}
CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
*/
std::u32string Utils::ConvertUtf8To32(std::string_view in) {
std::u32string str;
// Actual size cannot be smaller than this
str.reserve(in.size());
for (char32_t codepoint : Utf8IterableString(in)) {
str += codepoint;
}
return str;
}
/// Encodes UTF-32 text as UTF-8.
///
/// Code points up to 0x10FFFF are written as one to four bytes. Values above
/// 0x10FFFF produce no output at all. No other validation is performed.
std::string Utils::ConvertUtf32To8(std::u32string_view in) {
    std::string encoded;

    // Appends the low eight bits of `bits` as a single byte.
    const auto emit = [&encoded](char32_t bits) {
        encoded.push_back(static_cast<char>(bits));
    };
    // Appends a continuation byte (10xxxxxx) carrying the low six bits of `bits`.
    const auto emitContinuation = [&emit](char32_t bits) {
        emit(0x80 | (bits & 0x3F));
    };

    for (const char32_t cp : in) {
        if (cp <= 0x7F) {
            emit(cp); // 0xxxxxxx
        } else if (cp <= 0x7FF) {
            emit(0xC0 | (cp >> 6)); // 110xxxxx
            emitContinuation(cp);
        } else if (cp <= 0xFFFF) {
            emit(0xE0 | (cp >> 12)); // 1110xxxx
            emitContinuation(cp >> 6);
            emitContinuation(cp);
        } else if (cp <= 0x10FFFF) {
            emit(0xF0 | (cp >> 18)); // 11110xxx
            emitContinuation(cp >> 12);
            emitContinuation(cp >> 6);
            emitContinuation(cp);
        }
    }
    return encoded;
}
/*
TEST_CASE("Utils::ConvertUtf32To8() with ASCII") {
auto output = Utils::ConvertUtf32To8(U"This is an ASCII string");
CHECK(output == "This is an ASCII string");
}
TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") {
auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
*/
/// Returns the sub-view of `str` covering code points [begin, end).
///
/// @param str   UTF-8 text; it is not validated.
/// @param begin Index of the first code point to include.
/// @param end   Index one past the last code point to include.
/// @return A view into `str`. Indices past the last code point are clamped to
///         the end of the string, and `end <= begin` yields an empty view, so
///         the result never extends beyond `str`.
std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) {
    const auto strEnd = str.end();
    Utf8Iterator it{ str.begin() };
    size_t i = 0; // Index of the code point `it` currently points at

    // Byte offset of `it` within `str`, clamped in case a truncated trailing
    // sequence made the iterator step over the end.
    const auto byteOffset = [&]() -> size_t {
        const auto offset = static_cast<size_t>(std::distance(str.begin(), it.AsInternal()));
        return offset < str.size() ? offset : str.size();
    };

    // Skip until `it` points to the `begin`-th code point, or the string runs out.
    while (i < begin && it.AsInternal() < strEnd) {
        ++i;
        ++it;
    }
    const size_t sliceBegin = byteOffset();

    // Walk on to the `end`-th code point, or the end of the string.
    while (i < end && it.AsInternal() < strEnd) {
        ++i;
        ++it;
    }
    const size_t sliceEnd = byteOffset();

    // sliceBegin <= sliceEnd <= str.size(), so substr cannot throw here.
    return str.substr(sliceBegin, sliceEnd - sliceBegin);
}
/*
TEST_CASE("Utils::SliceUtf8() with ASCII") {
auto a = Utils::SliceUtf8("This is an ASCII string", 1, 1 + 5);
std::string range(a);
CHECK(range == "his i");
}
TEST_CASE("Utils::SliceUtf8() with BMP codepoints") {
std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
CHECK(range == "t \u8FD9\u662F\u4E00");
}
*/
size_t Utils::CountUtf8Codepoints(std::string_view str) {
size_t result = 0;
for (char32_t _ : Utf8IterableString(str)) {
result++;
}
return result;
}
/*
TEST_CASE("Utils::CountUtf8Codepoints() test") {
CHECK(Utils::CountUtf8Codepoints("This is an ASCII string") == 23);
CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
}
Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) {
Utf8Iterator it{ str.begin() };
Utf8Iterator prev{ it };
size_t codepoints = 0;
Utf8Iterator end{ str.end() };
while (it != end) {
codepoints++;
prev = it;
it++;
}
// it == end
// prev == <last codepoint in str>
return {
codepoints - 1,
(size_t)std::distance(str.begin(), prev.AsInternal()),
};
}
TEST_CASE("Utils::FindLastCodepoint() ASCII test") {
auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string");
CHECK(index == 22);
CHECK(byteOffset == 22);
}
TEST_CASE("Utils::FindLastCodepoint() BMP test") {
auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
CHECK(index == 22);
CHECK(byteOffset == 40);
}
Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) {
Utf8Iterator it{ str.begin() };
Utf8Iterator prev{ it };
size_t codepoint = 0;
Utf8Iterator end{ str.end() };
while (true) {
if (codepoint == codepointIdx) {
return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) };
}
if (it == end) {
return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) };
}
codepoint++;
prev = it;
it++;
}
}
TEST_CASE("Utils::FindCodepoint() ASCII test") {
auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6);
CHECK(codepointOffset == 6);
CHECK(byteOffset == 6);
}
TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") {
auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100);
CHECK(codepointOffset == 22);
CHECK(byteOffset == 22);
}
TEST_CASE("Utils::FindCodepoint() BMP test") {
auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
CHECK(codepointOffset == 14);
CHECK(byteOffset == 16);
}
TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") {
auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
CHECK(codepointOffset == 22);
CHECK(byteOffset == 40);
}
*/
|