From f3269a49c474ffe4d382c3d60826ad1cfbb7cdc4 Mon Sep 17 00:00:00 2001 From: rtk0c Date: Fri, 25 Nov 2022 17:28:07 -0800 Subject: Changeset: 93 Branch comment: [] Port font and UTF-8 string utilities from p6503 --- source/10-common/String.cpp | 350 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 source/10-common/String.cpp (limited to 'source/10-common/String.cpp') diff --git a/source/10-common/String.cpp b/source/10-common/String.cpp new file mode 100644 index 0000000..9e61893 --- /dev/null +++ b/source/10-common/String.cpp @@ -0,0 +1,350 @@ +#include "String.hpp" + +/* +#include +*/ + +Utf8Iterator::Utf8Iterator(std::string_view::iterator it) + : mIter{ std::move(it) } { +} + +constexpr unsigned char kFirstBitMask = 0b10000000; +constexpr unsigned char kSecondBitMask = 0b01000000; +constexpr unsigned char kThirdBitMask = 0b00100000; +constexpr unsigned char kFourthBitMask = 0b00010000; +constexpr unsigned char kFifthBitMask = 0b00001000; + +Utf8Iterator& Utf8Iterator::operator++() { + char firstByte = *mIter; + std::string::difference_type offset = 1; + + // This means the first byte has a value greater than 127, and so is beyond the ASCII range. + if (firstByte & kFirstBitMask) { + // This means that the first byte has a value greater than 224, and so it must be at least a three-octet code point. + if (firstByte & kThirdBitMask) { + // This means that the first byte has a value greater than 240, and so it must be a four-octet code point. + if (firstByte & kFourthBitMask) { + offset = 4; + } else { + offset = 3; + } + } else { + offset = 2; + } + } + + mIter += offset; + mDirty = true; + return *this; +} + +Utf8Iterator Utf8Iterator::operator++(int) { + Utf8Iterator temp = *this; + ++(*this); + return temp; +} + +Utf8Iterator& Utf8Iterator::operator--() { + --mIter; + + // This means that the previous byte is not an ASCII character. + if (*mIter & kFirstBitMask) { + --mIter; + if ((*mIter & kSecondBitMask) == 0) { + --mIter; + if ((*mIter & kSecondBitMask) == 0) { + --mIter; + } + } + } + + mDirty = true; + return *this; +} + +Utf8Iterator Utf8Iterator::operator--(int) { + Utf8Iterator temp = *this; + --(*this); + return temp; +} + +char32_t Utf8Iterator::operator*() const { + UpdateCurrentValue(); + return mCurrentCodePoint; +} + +std::string_view::iterator Utf8Iterator::AsInternal() const { + // updateCurrentValue(); + return mIter; +} + +bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) { + return lhs.mIter == rhs.mIter; +} + +bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) { + return lhs.mIter != rhs.mIter; +} + +bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) { + return lhs.mIter == rhs; +} + +bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) { + return lhs.mIter != rhs; +} + +void Utf8Iterator::UpdateCurrentValue() const { + if (!mDirty) { + return; + } + + mCurrentCodePoint = 0; + char firstByte = *mIter; + + // This means the first byte has a value greater than 127, and so is beyond the ASCII range. + if (firstByte & kFirstBitMask) { + // This means that the first byte has a value greater than 191, and so it must be at least a three-octet code point. + if (firstByte & kThirdBitMask) { + // This means that the first byte has a value greater than 224, and so it must be a four-octet code point. + if (firstByte & kFourthBitMask) { + mCurrentCodePoint = (firstByte & 0x07) << 18; + char secondByte = *(mIter + 1); + mCurrentCodePoint += (secondByte & 0x3f) << 12; + char thirdByte = *(mIter + 2); + mCurrentCodePoint += (thirdByte & 0x3f) << 6; + + char fourthByte = *(mIter + 3); + mCurrentCodePoint += (fourthByte & 0x3f); + } else { + mCurrentCodePoint = (firstByte & 0x0f) << 12; + char secondByte = *(mIter + 1); + mCurrentCodePoint += (secondByte & 0x3f) << 6; + char thirdByte = *(mIter + 2); + mCurrentCodePoint += (thirdByte & 0x3f); + } + } else { + mCurrentCodePoint = (firstByte & 0x1f) << 6; + char secondByte = *(mIter + 1); + mCurrentCodePoint += (secondByte & 0x3f); + } + } else { + mCurrentCodePoint = firstByte; + } + + mDirty = true; +} + +Utf8IterableString::Utf8IterableString(std::string_view str) + : mStr{ str } { +} + +Utf8Iterator Utf8IterableString::begin() const { + return Utf8Iterator(mStr.begin()); +} + +Utf8Iterator Utf8IterableString::end() const { + return Utf8Iterator(mStr.end()); +} + +/* +TEST_CASE("Iterating ASCII string") { + std::string ascii("This is an ASCII string"); + std::u32string output; + output.reserve(ascii.length()); + + for (char32_t c : Utf8IterableString(ascii)) { + output += c; + } + + CHECK(output == U"This is an ASCII string"); +} + +// BMP: Basic Multilingual Plane +TEST_CASE("Iterating BMP string") { + std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); + std::u32string output; + output.reserve(10); + + for (char32_t c : Utf8IterableString(unicode)) { + output += c; + } + + CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); +} +*/ + +std::u32string Utils::ConvertUtf8To32(std::string_view in) { + std::u32string str; + // Actual size cannot be smaller than this + str.reserve(in.size()); + for (char32_t codepoint : Utf8IterableString(in)) { + str += codepoint; + } + return str; +} + +std::string Utils::ConvertUtf32To8(std::u32string_view in) { + std::string str; + for (char32_t codepoint : in) { + if (codepoint <= 0x7F) { + str += codepoint; + } else if (codepoint <= 0x7FF) { + str += 0xC0 | (codepoint >> 6); // 110xxxxx + str += 0x80 | (codepoint & 0x3F); // 10xxxxxx + } else if (codepoint <= 0xFFFF) { + str += 0xE0 | (codepoint >> 12); // 1110xxxx + str += 0x80 | ((codepoint >> 6) & 0x3F); // 10xxxxxx + str += 0x80 | (codepoint & 0x3F); // 10xxxxxx + } else if (codepoint <= 0x10FFFF) { + str += 0xF0 | (codepoint >> 18); // 11110xxx + str += 0x80 | ((codepoint >> 12) & 0x3F); // 10xxxxxx + str += 0x80 | ((codepoint >> 6) & 0x3F); // 10xxxxxx + str += 0x80 | (codepoint & 0x3F); // 10xxxxxx + } + } + return str; +} + +/* +TEST_CASE("Utils::ConvertUtf32To8() with ASCII") { + auto output = Utils::ConvertUtf32To8(U"This is an ASCII string"); + CHECK(output == "This is an ASCII string"); +} + +TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") { + auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); + CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); +} + */ + +std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) { + const char* resBegin; + size_t resLength = 0; + + Utf8Iterator it{ str.begin() }; + size_t i = 0; // Nth codepoint on the string + + // Skip until `it` points to the `begin`-th codepoint in the string + while (i < begin) { + i++; + it++; + } // Postcondition: i == begin + resBegin = &*it.AsInternal(); + + while (i < end) { + auto prev = it; + i++; + it++; + + resLength += std::distance(prev.AsInternal(), it.AsInternal()); + } // Postcondition: i == end + + return { resBegin, resLength }; +} + +/* +TEST_CASE("Utils::CreateRange() with ASCII") { + auto a = Utils::CreateRange("This is an ASCII string", 1, 1 + 5); + std::string range(a); + CHECK(range == "his i"); +} + +TEST_CASE("Utils::CreateRange() with BMP codepoints") { + std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5)); + CHECK(range == "t \u8FD9\u662F\u4E00"); +} +*/ + +size_t Utils::CountUtf8Codepoints(std::string_view str) { + size_t result = 0; + for (char32_t _ : Utf8IterableString(str)) { + result++; + } + return result; +} + +/* +TEST_CASE("Utils::GetLength() test") { + CHECK(Utils::GetLength("This is an ASCII string") == 23); + CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23); +} + +Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) { + Utf8Iterator it{ str.begin() }; + Utf8Iterator prev{ it }; + size_t codepoints = 0; + + Utf8Iterator end{ str.end() }; + while (it != end) { + codepoints++; + + prev = it; + it++; + } + // it == end + // prev == + + return { + codepoints - 1, + (size_t)std::distance(str.begin(), prev.AsInternal()), + }; +} + +TEST_CASE("Utils::FindLastCodepoint() ASCII test") { + auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string"); + CHECK(index == 22); + CHECK(index == 22); +} + +TEST_CASE("Utils::FindLastCodepoint() BMP test") { + auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); + CHECK(index == 22); + CHECK(byteOffset == 40); +} + +Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) { + Utf8Iterator it{ str.begin() }; + Utf8Iterator prev{ it }; + size_t codepoint = 0; + + Utf8Iterator end{ str.end() }; + while (true) { + if (codepoint == codepointIdx) { + return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) }; + } + if (it == end) { + return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) }; + } + + codepoint++; + + prev = it; + it++; + } +} + +TEST_CASE("Utils::FindCodepoint() ASCII test") { + auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6); + CHECK(codepointOffset == 6); + CHECK(byteOffset == 6); +} + +TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") { + auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100); + CHECK(codepointOffset == 22); + CHECK(byteOffset == 22); +} + +TEST_CASE("Utils::FindCodepoint() BMP test") { + auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14); + CHECK(codepointOffset == 14); + CHECK(byteOffset == 16); +} + +TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") { + auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100); + CHECK(codepointOffset == 22); + CHECK(byteOffset == 40); +} +*/ -- cgit v1.2.3-70-g09d2