From 44f5fa5c8f258e8fc1f7d7e2e45e0485bd6cc490 Mon Sep 17 00:00:00 2001 From: rtk0c Date: Wed, 31 Mar 2021 20:19:18 -0700 Subject: Complete items tab (UI and serialization) --- core/src/Utils/String.cpp | 340 ---------------------------------------------- 1 file changed, 340 deletions(-) delete mode 100644 core/src/Utils/String.cpp (limited to 'core/src/Utils/String.cpp') diff --git a/core/src/Utils/String.cpp b/core/src/Utils/String.cpp deleted file mode 100644 index 94cd0f5..0000000 --- a/core/src/Utils/String.cpp +++ /dev/null @@ -1,340 +0,0 @@ -#include "String.hpp" - -#include - -Utf8Iterator::Utf8Iterator(std::string_view::iterator it) - : mIter{ std::move(it) } { -} - -constexpr unsigned char kFirstBitMask = 0b10000000; -constexpr unsigned char kSecondBitMask = 0b01000000; -constexpr unsigned char kThirdBitMask = 0b00100000; -constexpr unsigned char kFourthBitMask = 0b00010000; -constexpr unsigned char kFifthBitMask = 0b00001000; - -Utf8Iterator& Utf8Iterator::operator++() { - char firstByte = *mIter; - std::string::difference_type offset = 1; - - // This means the first byte has a value greater than 127, and so is beyond the ASCII range. - if (firstByte & kFirstBitMask) { - // This means that the first byte has a value greater than 224, and so it must be at least a three-octet code point. - if (firstByte & kThirdBitMask) { - // This means that the first byte has a value greater than 240, and so it must be a four-octet code point. - if (firstByte & kFourthBitMask) { - offset = 4; - } else { - offset = 3; - } - } else { - offset = 2; - } - } - - mIter += offset; - mDirty = true; - return *this; -} - -Utf8Iterator Utf8Iterator::operator++(int) { - Utf8Iterator temp = *this; - ++(*this); - return temp; -} - -Utf8Iterator& Utf8Iterator::operator--() { - --mIter; - - // This means that the previous byte is not an ASCII character. - if (*mIter & kFirstBitMask) { - --mIter; - if ((*mIter & kSecondBitMask) == 0) { - --mIter; - if ((*mIter & kSecondBitMask) == 0) { - --mIter; - } - } - } - - mDirty = true; - return *this; -} - -Utf8Iterator Utf8Iterator::operator--(int) { - Utf8Iterator temp = *this; - --(*this); - return temp; -} - -char32_t Utf8Iterator::operator*() const { - UpdateCurrentValue(); - return mCurrentCodePoint; -} - -std::string_view::iterator Utf8Iterator::AsInternal() const { - // updateCurrentValue(); - return mIter; -} - -bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) { - return lhs.mIter == rhs.mIter; -} - -bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) { - return lhs.mIter != rhs.mIter; -} - -bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) { - return lhs.mIter == rhs; -} - -bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) { - return lhs.mIter != rhs; -} - -void Utf8Iterator::UpdateCurrentValue() const { - if (!mDirty) { - return; - } - - mCurrentCodePoint = 0; - char firstByte = *mIter; - - // This means the first byte has a value greater than 127, and so is beyond the ASCII range. - if (firstByte & kFirstBitMask) { - // This means that the first byte has a value greater than 191, and so it must be at least a three-octet code point. - if (firstByte & kThirdBitMask) { - // This means that the first byte has a value greater than 224, and so it must be a four-octet code point. - if (firstByte & kFourthBitMask) { - mCurrentCodePoint = (firstByte & 0x07) << 18; - char secondByte = *(mIter + 1); - mCurrentCodePoint += (secondByte & 0x3f) << 12; - char thirdByte = *(mIter + 2); - mCurrentCodePoint += (thirdByte & 0x3f) << 6; - - char fourthByte = *(mIter + 3); - mCurrentCodePoint += (fourthByte & 0x3f); - } else { - mCurrentCodePoint = (firstByte & 0x0f) << 12; - char secondByte = *(mIter + 1); - mCurrentCodePoint += (secondByte & 0x3f) << 6; - char thirdByte = *(mIter + 2); - mCurrentCodePoint += (thirdByte & 0x3f); - } - } else { - mCurrentCodePoint = (firstByte & 0x1f) << 6; - char secondByte = *(mIter + 1); - mCurrentCodePoint += (secondByte & 0x3f); - } - } else { - mCurrentCodePoint = firstByte; - } - - mDirty = true; -} - -Utf8IterableString::Utf8IterableString(std::string_view str) - : mStr{ str } { -} - -Utf8Iterator Utf8IterableString::begin() const { - return Utf8Iterator(mStr.begin()); -} - -Utf8Iterator Utf8IterableString::end() const { - return Utf8Iterator(mStr.end()); -} - -TEST_CASE("Iterating ASCII string") { - std::string ascii("This is an ASCII string"); - std::u32string output; - output.reserve(ascii.length()); - - for (char32_t c : Utf8IterableString(ascii)) { - output += c; - } - - CHECK(output == U"This is an ASCII string"); -} - -// BMP: Basic Multilingual Plane -TEST_CASE("Iterating BMP string") { - std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); - std::u32string output; - output.reserve(10); - - for (char32_t c : Utf8IterableString(unicode)) { - output += c; - } - - CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); -} - -std::u32string ConvertUtf8To32(std::string_view in) { - std::u32string str; - // Actual size cannot be smaller than this - str.reserve(in.size()); - for (char32_t codepoint : Utf8IterableString(in)) { - str += codepoint; - } - return str; -} - -std::string ConvertUtf32To8(std::u32string_view in) { - std::string str; - for (char32_t codepoint : in) { - if (codepoint <= 0x7F) { - str += codepoint; - } else if (codepoint <= 0x7FF) { - str += 0xC0 | (codepoint >> 6); // 110xxxxx - str += 0x80 | (codepoint & 0x3F); // 10xxxxxx - } else if (codepoint <= 0xFFFF) { - str += 0xE0 | (codepoint >> 12); // 1110xxxx - str += 0x80 | ((codepoint >> 6) & 0x3F); // 10xxxxxx - str += 0x80 | (codepoint & 0x3F); // 10xxxxxx - } else if (codepoint <= 0x10FFFF) { - str += 0xF0 | (codepoint >> 18); // 11110xxx - str += 0x80 | ((codepoint >> 12) & 0x3F); // 10xxxxxx - str += 0x80 | ((codepoint >> 6) & 0x3F); // 10xxxxxx - str += 0x80 | (codepoint & 0x3F); // 10xxxxxx - } - } - return str; -} - -TEST_CASE("convertUtf32To8() with ASCII") { - auto output = ConvertUtf32To8(U"This is an ASCII string"); - CHECK(output == "This is an ASCII string"); -} - -TEST_CASE("convertUtf32To8() with BMP codepoints") { - auto output = ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); - CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); -} - -std::string_view StringRange(std::string_view str, size_t begin, size_t end) { - const char* resBegin; - size_t resLength = 0; - - Utf8Iterator it{ str.begin() }; - size_t i = 0; // Nth codepoint on the string - - // Skip until `it` points to the `begin`-th codepoint in the string - while (i < begin) { - i++; - it++; - } // Postcondition: i == begin - resBegin = &*it.AsInternal(); - - while (i < end) { - auto prev = it; - i++; - it++; - - resLength += std::distance(prev.AsInternal(), it.AsInternal()); - } // Postcondition: i == end - - return { resBegin, resLength }; -} - -TEST_CASE("stringRange() with ASCII") { - auto a = StringRange("This is an ASCII string", 1, 1 + 5); - std::string range(a); - CHECK(range == "his i"); -} - -TEST_CASE("stringRange() with BMP codepoints") { - std::string range(StringRange("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5)); - CHECK(range == "t \u8FD9\u662F\u4E00"); -} - -size_t StringLength(std::string_view str) { - size_t result = 0; - for (char32_t _ : Utf8IterableString(str)) { - result++; - } - return result; -} - -TEST_CASE("StringLength() test") { - CHECK(StringLength("This is an ASCII string") == 23); - CHECK(StringLength("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23); -} - -CodepointInfo StringLastCodepoint(std::string_view str) { - Utf8Iterator it{ str.begin() }; - Utf8Iterator prev{ it }; - size_t codepoints = 0; - - Utf8Iterator end{ str.end() }; - while (it != end) { - codepoints++; - - prev = it; - it++; - } - // it == end - // prev == - - return { - codepoints - 1, - (size_t)std::distance(str.begin(), prev.AsInternal()), - }; -} - -TEST_CASE("stringLastCodepoint() ASCII test") { - auto [index, byteOffset] = StringLastCodepoint("This is an ASCII string"); - CHECK(index == 22); - CHECK(index == 22); -} - -TEST_CASE("stringLastCodepoint() BMP test") { - auto [index, byteOffset] = StringLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32"); - CHECK(index == 22); - CHECK(byteOffset == 40); -} - -CodepointInfo StringCodepoint(std::string_view str, size_t codepointIdx) { - Utf8Iterator it{ str.begin() }; - Utf8Iterator prev{ it }; - size_t codepoint = 0; - - Utf8Iterator end{ str.end() }; - while (true) { - if (codepoint == codepointIdx) { - return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) }; - } - if (it == end) { - return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) }; - } - - codepoint++; - - prev = it; - it++; - } -} - -TEST_CASE("stringCodepoint() ASCII test") { - auto [codepointOffset, byteOffset] = StringCodepoint("This is an ASCII string", 6); - CHECK(codepointOffset == 6); - CHECK(byteOffset == 6); -} - -TEST_CASE("stringCodepoint() ASCII past-the-end test") { - auto [codepointOffset, byteOffset] = StringCodepoint("This is an ASCII string", 100); - CHECK(codepointOffset == 22); - CHECK(byteOffset == 22); -} - -TEST_CASE("stringCodepoint() BMP test") { - auto [codepointOffset, byteOffset] = StringCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14); - CHECK(codepointOffset == 14); - CHECK(byteOffset == 16); -} - -TEST_CASE("stringCodepoint() BMP past-the-end test") { - auto [codepointOffset, byteOffset] = StringCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100); - CHECK(codepointOffset == 22); - CHECK(byteOffset == 40); -} -- cgit v1.2.3-70-g09d2