#include "String.hpp"

/* #include */

// Wraps a raw byte iterator into a UTF-8 encoded string so that it can be advanced one code point
// at a time. The iterator has no knowledge of where the underlying string ends; callers are
// responsible for not advancing or dereferencing it past the end.
Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
    : mIter{ std::move(it) } {
}

// Masks for individual bits of a UTF-8 byte, counted from the most significant bit.
// Lead byte patterns:
//   0xxxxxxx  single octet (ASCII)
//   110xxxxx  two octets
//   1110xxxx  three octets
//   11110xxx  four octets
// Continuation bytes always look like 10xxxxxx.
constexpr unsigned char kFirstBitMask = 0b10000000;
constexpr unsigned char kSecondBitMask = 0b01000000;
constexpr unsigned char kThirdBitMask = 0b00100000;
constexpr unsigned char kFourthBitMask = 0b00010000;
[[maybe_unused]] constexpr unsigned char kFifthBitMask = 0b00001000;

// Advances to the first byte of the next code point.
// The step size is derived from the lead byte only; the input is assumed to be well-formed UTF-8
// and the iterator is assumed to currently point at a lead byte.
Utf8Iterator& Utf8Iterator::operator++() {
    const char firstByte = *mIter;

    std::string::difference_type offset = 1;
    // High bit set: the byte is >= 0x80, so this is not a single-octet (ASCII) code point.
    if (firstByte & kFirstBitMask) {
        // Third bit set: the byte is >= 0xE0, so this is at least a three-octet code point.
        if (firstByte & kThirdBitMask) {
            // Fourth bit set: the byte is >= 0xF0, so this is a four-octet code point.
            if (firstByte & kFourthBitMask) {
                offset = 4;
            } else {
                offset = 3;
            }
        } else {
            offset = 2;
        }
    }

    mIter += offset;
    mDirty = true; // The cached code point no longer matches the position.
    return *this;
}

// Post-increment: advances the iterator and returns a copy of its previous state.
Utf8Iterator Utf8Iterator::operator++(int) {
    Utf8Iterator temp = *this;
    ++(*this);
    return temp;
}

// Moves back to the first byte of the previous code point.
// Assumes well-formed UTF-8 and that the iterator is not at the beginning of the string.
Utf8Iterator& Utf8Iterator::operator--() {
    --mIter;
    // High bit set: the previous byte is not ASCII, so it is the last continuation byte of a
    // multi-octet sequence, which is at least two bytes long.
    if (*mIter & kFirstBitMask) {
        --mIter;
        // Second bit clear (10xxxxxx): still on a continuation byte, so keep walking back.
        if ((*mIter & kSecondBitMask) == 0) {
            --mIter;
            if ((*mIter & kSecondBitMask) == 0) {
                --mIter;
            }
        }
    }

    mDirty = true; // The cached code point no longer matches the position.
    return *this;
}

// Post-decrement: moves the iterator back and returns a copy of its previous state.
Utf8Iterator Utf8Iterator::operator--(int) {
    Utf8Iterator temp = *this;
    --(*this);
    return temp;
}

// Returns the code point at the current position, decoding it first if the cache is stale.
char32_t Utf8Iterator::operator*() const {
    UpdateCurrentValue();
    return mCurrentCodePoint;
}

// Returns the underlying byte iterator, pointing at the first byte of the current code point.
std::string_view::iterator Utf8Iterator::AsInternal() const {
    // updateCurrentValue();
    return mIter;
}

// Two iterators are equal when they point at the same byte; the decode cache is not compared.
bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
    return lhs.mIter == rhs.mIter;
}

bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
    return lhs.mIter != rhs.mIter;
}

// Comparison against a raw byte iterator, e.g. `it != str.end()`.
bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
    return lhs.mIter == rhs;
}

bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
    return lhs.mIter != rhs;
}

// Decodes the code point at the current position into mCurrentCodePoint, unless the cached value
// is still valid. Reads up to three bytes after the lead byte, so the sequence at the current
// position must be complete.
void Utf8Iterator::UpdateCurrentValue() const {
    if (!mDirty) {
        return;
    }

    // Work on unsigned bytes so that masking and shifting never involve negative values.
    const auto firstByte = static_cast<unsigned char>(*mIter);
    // Payload (low six bits) of the continuation byte `n` positions after the lead byte.
    const auto continuation = [this](std::string_view::difference_type n) {
        return static_cast<char32_t>(static_cast<unsigned char>(*(mIter + n)) & 0x3f);
    };

    char32_t codePoint = 0;
    if ((firstByte & kFirstBitMask) == 0) {
        // 0xxxxxxx: single octet (ASCII).
        codePoint = firstByte;
    } else if ((firstByte & kThirdBitMask) == 0) {
        // 110xxxxx 10xxxxxx: two octets.
        codePoint = static_cast<char32_t>(firstByte & 0x1f) << 6;
        codePoint += continuation(1);
    } else if ((firstByte & kFourthBitMask) == 0) {
        // 1110xxxx 10xxxxxx 10xxxxxx: three octets.
        codePoint = static_cast<char32_t>(firstByte & 0x0f) << 12;
        codePoint += continuation(1) << 6;
        codePoint += continuation(2);
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four octets.
        codePoint = static_cast<char32_t>(firstByte & 0x07) << 18;
        codePoint += continuation(1) << 12;
        codePoint += continuation(2) << 6;
        codePoint += continuation(3);
    }

    mCurrentCodePoint = codePoint;
    // The cache is now valid until the iterator moves again.
    mDirty = false;
}

// Non-owning view over a UTF-8 string that can be iterated code point by code point.
Utf8IterableString::Utf8IterableString(std::string_view str)
    : mStr{ str } {
}

Utf8Iterator Utf8IterableString::begin() const {
    return Utf8Iterator(mStr.begin());
}

Utf8Iterator Utf8IterableString::end() const {
    return Utf8Iterator(mStr.end());
}

/*
TEST_CASE("Iterating ASCII string") {
    std::string ascii("This is an ASCII string");
    std::u32string output;
    output.reserve(ascii.length());

    for (char32_t c : Utf8IterableString(ascii)) {
        output += c;
    }

    CHECK(output == U"This is an ASCII string");
}

// BMP: Basic Multilingual Plane
TEST_CASE("Iterating BMP string") {
    std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
    std::u32string output;
    output.reserve(10);

    for (char32_t c : Utf8IterableString(unicode)) {
        output += c;
    }

    CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
*/

// Decodes a well-formed UTF-8 string into a sequence of UTF-32 code points.
std::u32string Utils::ConvertUtf8To32(std::string_view in) {
    std::u32string str;
    // Every code point occupies at least one byte, so the result cannot be longer than this.
    str.reserve(in.size());
    for (char32_t codepoint : Utf8IterableString(in)) {
        str += codepoint;
    }
    return str;
}

// Encodes a sequence of UTF-32 code points as UTF-8.
// Values above U+10FFFF are outside the Unicode range and are silently skipped.
std::string Utils::ConvertUtf32To8(std::u32string_view in) {
    std::string str;
    // Each encoded code point takes at least one byte; this avoids the first few reallocations.
    str.reserve(in.size());
    for (char32_t codepoint : in) {
        if (codepoint <= 0x7F) {
            str += static_cast<char>(codepoint); // 0xxxxxxx
        } else if (codepoint <= 0x7FF) {
            str += static_cast<char>(0xC0 | (codepoint >> 6)); // 110xxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        } else if (codepoint <= 0xFFFF) {
            str += static_cast<char>(0xE0 | (codepoint >> 12)); // 1110xxxx
            str += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        } else if (codepoint <= 0x10FFFF) {
            str += static_cast<char>(0xF0 | (codepoint >> 18)); // 11110xxx
            str += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        }
    }
    return str;
}

/*
TEST_CASE("Utils::ConvertUtf32To8() with ASCII") {
    auto output = Utils::ConvertUtf32To8(U"This is an ASCII string");
    CHECK(output == "This is an ASCII string");
}

TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") {
    auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
    CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
*/

// Returns the sub-view of `str` covering the code points with indices in [begin, end).
// Indices count code points, not bytes. Indices beyond the end of the string are clamped to the
// end, so an out-of-range request yields a shorter (possibly empty) slice instead of reading past
// the buffer. If `end <= begin` the result is empty.
std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) {
    const auto strEnd = str.end();

    // Byte offset of `pos` from the start of `str`, clamped to the string's size in case a
    // truncated multi-byte sequence at the very end made the iterator step past it.
    const auto byteOffset = [&str](const Utf8Iterator& pos) {
        const auto offset = static_cast<size_t>(std::distance(str.begin(), pos.AsInternal()));
        return offset < str.size() ? offset : str.size();
    };

    Utf8Iterator it{ str.begin() };
    size_t i = 0; // Nth codepoint on the string

    // Skip until `it` points to the `begin`-th codepoint in the string (or the end of the string)
    while (i < begin && it.AsInternal() < strEnd) {
        ++i;
        ++it;
    }
    const size_t beginOffset = byteOffset(it);

    // Continue until `it` points to the `end`-th codepoint in the string (or the end of the string)
    while (i < end && it.AsInternal() < strEnd) {
        ++i;
        ++it;
    }
    const size_t endOffset = byteOffset(it);

    return str.substr(beginOffset, endOffset - beginOffset);
}

/*
TEST_CASE("Utils::CreateRange() with ASCII") {
    auto a = Utils::CreateRange("This is an ASCII string", 1, 1 + 5);
    std::string range(a);
    CHECK(range == "his i");
}

TEST_CASE("Utils::CreateRange() with BMP codepoints") {
    std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
    CHECK(range == "t \u8FD9\u662F\u4E00");
}
*/

// Counts the code points in a well-formed UTF-8 string.
size_t Utils::CountUtf8Codepoints(std::string_view str) {
    size_t result = 0;
    // Only the lead bytes are inspected; the code points themselves are never decoded.
    for (Utf8Iterator it{ str.begin() }; it != str.end(); ++it) {
        ++result;
    }
    return result;
}

/*
TEST_CASE("Utils::GetLength() test") {
    CHECK(Utils::GetLength("This is an ASCII string") == 23);
    CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
}

Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) {
    Utf8Iterator it{ str.begin() };
    Utf8Iterator prev{ it };
    size_t codepoints = 0;

    Utf8Iterator end{ str.end() };
    while (it != end) {
        codepoints++;
        prev = it;
        it++;
    }

    // it == end
    // prev == last codepoint

    return {
        codepoints - 1,
        (size_t)std::distance(str.begin(), prev.AsInternal()),
    };
}

TEST_CASE("Utils::FindLastCodepoint() ASCII test") {
    auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string");
    CHECK(index == 22);
    CHECK(index == 22);
}

TEST_CASE("Utils::FindLastCodepoint() BMP test") {
    auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
    CHECK(index == 22);
    CHECK(byteOffset == 40);
}

Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) {
    Utf8Iterator it{ str.begin() };
    Utf8Iterator prev{ it };
    size_t codepoint = 0;

    Utf8Iterator end{ str.end() };
    while (true) {
        if (codepoint == codepointIdx) {
            return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) };
        }
        if (it == end) {
            return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) };
        }

        codepoint++;
        prev = it;
        it++;
    }
}

TEST_CASE("Utils::FindCodepoint() ASCII test") {
    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6);
    CHECK(codepointOffset == 6);
    CHECK(byteOffset == 6);
}

TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") {
    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100);
    CHECK(codepointOffset == 22);
    CHECK(byteOffset == 22);
}

TEST_CASE("Utils::FindCodepoint() BMP test") {
    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
    CHECK(codepointOffset == 14);
    CHECK(byteOffset == 16);
}

TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") {
    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
    CHECK(codepointOffset == 22);
    CHECK(byteOffset == 40);
}
*/