aboutsummaryrefslogtreecommitdiff
path: root/core/src/Utils/String.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'core/src/Utils/String.cpp')
-rw-r--r--core/src/Utils/String.cpp340
1 files changed, 0 insertions, 340 deletions
diff --git a/core/src/Utils/String.cpp b/core/src/Utils/String.cpp
deleted file mode 100644
index 94cd0f5..0000000
--- a/core/src/Utils/String.cpp
+++ /dev/null
@@ -1,340 +0,0 @@
-#include "String.hpp"
-
-#include <doctest/doctest.h>
-
-Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
- : mIter{ std::move(it) } {
-}
-
// Single-bit masks for one byte, numbered from the most significant bit downwards.
constexpr unsigned char kFirstBitMask = 0x80;  // 1000'0000
constexpr unsigned char kSecondBitMask = 0x40; // 0100'0000
constexpr unsigned char kThirdBitMask = 0x20;  // 0010'0000
constexpr unsigned char kFourthBitMask = 0x10; // 0001'0000
constexpr unsigned char kFifthBitMask = 0x08;  // 0000'1000
-
-Utf8Iterator& Utf8Iterator::operator++() {
- char firstByte = *mIter;
- std::string::difference_type offset = 1;
-
- // This means the first byte has a value greater than 127, and so is beyond the ASCII range.
- if (firstByte & kFirstBitMask) {
- // This means that the first byte has a value greater than 224, and so it must be at least a three-octet code point.
- if (firstByte & kThirdBitMask) {
- // This means that the first byte has a value greater than 240, and so it must be a four-octet code point.
- if (firstByte & kFourthBitMask) {
- offset = 4;
- } else {
- offset = 3;
- }
- } else {
- offset = 2;
- }
- }
-
- mIter += offset;
- mDirty = true;
- return *this;
-}
-
-Utf8Iterator Utf8Iterator::operator++(int) {
- Utf8Iterator temp = *this;
- ++(*this);
- return temp;
-}
-
-Utf8Iterator& Utf8Iterator::operator--() {
- --mIter;
-
- // This means that the previous byte is not an ASCII character.
- if (*mIter & kFirstBitMask) {
- --mIter;
- if ((*mIter & kSecondBitMask) == 0) {
- --mIter;
- if ((*mIter & kSecondBitMask) == 0) {
- --mIter;
- }
- }
- }
-
- mDirty = true;
- return *this;
-}
-
-Utf8Iterator Utf8Iterator::operator--(int) {
- Utf8Iterator temp = *this;
- --(*this);
- return temp;
-}
-
-char32_t Utf8Iterator::operator*() const {
- UpdateCurrentValue();
- return mCurrentCodePoint;
-}
-
-std::string_view::iterator Utf8Iterator::AsInternal() const {
- // updateCurrentValue();
- return mIter;
-}
-
-bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
- return lhs.mIter == rhs.mIter;
-}
-
-bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
- return lhs.mIter != rhs.mIter;
-}
-
-bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
- return lhs.mIter == rhs;
-}
-
-bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
- return lhs.mIter != rhs;
-}
-
-void Utf8Iterator::UpdateCurrentValue() const {
- if (!mDirty) {
- return;
- }
-
- mCurrentCodePoint = 0;
- char firstByte = *mIter;
-
- // This means the first byte has a value greater than 127, and so is beyond the ASCII range.
- if (firstByte & kFirstBitMask) {
- // This means that the first byte has a value greater than 191, and so it must be at least a three-octet code point.
- if (firstByte & kThirdBitMask) {
- // This means that the first byte has a value greater than 224, and so it must be a four-octet code point.
- if (firstByte & kFourthBitMask) {
- mCurrentCodePoint = (firstByte & 0x07) << 18;
- char secondByte = *(mIter + 1);
- mCurrentCodePoint += (secondByte & 0x3f) << 12;
- char thirdByte = *(mIter + 2);
- mCurrentCodePoint += (thirdByte & 0x3f) << 6;
-
- char fourthByte = *(mIter + 3);
- mCurrentCodePoint += (fourthByte & 0x3f);
- } else {
- mCurrentCodePoint = (firstByte & 0x0f) << 12;
- char secondByte = *(mIter + 1);
- mCurrentCodePoint += (secondByte & 0x3f) << 6;
- char thirdByte = *(mIter + 2);
- mCurrentCodePoint += (thirdByte & 0x3f);
- }
- } else {
- mCurrentCodePoint = (firstByte & 0x1f) << 6;
- char secondByte = *(mIter + 1);
- mCurrentCodePoint += (secondByte & 0x3f);
- }
- } else {
- mCurrentCodePoint = firstByte;
- }
-
- mDirty = true;
-}
-
-Utf8IterableString::Utf8IterableString(std::string_view str)
- : mStr{ str } {
-}
-
-Utf8Iterator Utf8IterableString::begin() const {
- return Utf8Iterator(mStr.begin());
-}
-
-Utf8Iterator Utf8IterableString::end() const {
- return Utf8Iterator(mStr.end());
-}
-
-TEST_CASE("Iterating ASCII string") {
- std::string ascii("This is an ASCII string");
- std::u32string output;
- output.reserve(ascii.length());
-
- for (char32_t c : Utf8IterableString(ascii)) {
- output += c;
- }
-
- CHECK(output == U"This is an ASCII string");
-}
-
-// BMP: Basic Multilingual Plane
-TEST_CASE("Iterating BMP string") {
- std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
- std::u32string output;
- output.reserve(10);
-
- for (char32_t c : Utf8IterableString(unicode)) {
- output += c;
- }
-
- CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
-}
-
-std::u32string ConvertUtf8To32(std::string_view in) {
- std::u32string str;
- // Actual size cannot be smaller than this
- str.reserve(in.size());
- for (char32_t codepoint : Utf8IterableString(in)) {
- str += codepoint;
- }
- return str;
-}
-
/// Encodes UTF-32 code points as UTF-8.
///
/// Values that are not Unicode scalar values -- surrogates (U+D800..U+DFFF) and
/// anything above U+10FFFF -- cannot be represented in well-formed UTF-8. They are
/// replaced with U+FFFD REPLACEMENT CHARACTER instead of being emitted as ill-formed
/// bytes or silently dropped.
///
/// @param in Code points to encode.
/// @return The UTF-8 byte sequence.
std::string ConvertUtf32To8(std::u32string_view in) {
    constexpr char32_t kReplacementCharacter = 0xFFFD;

    std::string str;
    // Each code point produces at least one byte.
    str.reserve(in.size());

    for (char32_t codepoint : in) {
        const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        if (isSurrogate || codepoint > 0x10FFFF) {
            codepoint = kReplacementCharacter;
        }

        if (codepoint <= 0x7F) {
            str += static_cast<char>(codepoint); // 0xxxxxxx
        } else if (codepoint <= 0x7FF) {
            str += static_cast<char>(0xC0 | (codepoint >> 6)); // 110xxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        } else if (codepoint <= 0xFFFF) {
            str += static_cast<char>(0xE0 | (codepoint >> 12)); // 1110xxxx
            str += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        } else {
            str += static_cast<char>(0xF0 | (codepoint >> 18)); // 11110xxx
            str += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)); // 10xxxxxx
            str += static_cast<char>(0x80 | (codepoint & 0x3F)); // 10xxxxxx
        }
    }
    return str;
}
-
-TEST_CASE("convertUtf32To8() with ASCII") {
- auto output = ConvertUtf32To8(U"This is an ASCII string");
- CHECK(output == "This is an ASCII string");
-}
-
-TEST_CASE("convertUtf32To8() with BMP codepoints") {
- auto output = ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
- CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
-}
-
-std::string_view StringRange(std::string_view str, size_t begin, size_t end) {
- const char* resBegin;
- size_t resLength = 0;
-
- Utf8Iterator it{ str.begin() };
- size_t i = 0; // Nth codepoint on the string
-
- // Skip until `it` points to the `begin`-th codepoint in the string
- while (i < begin) {
- i++;
- it++;
- } // Postcondition: i == begin
- resBegin = &*it.AsInternal();
-
- while (i < end) {
- auto prev = it;
- i++;
- it++;
-
- resLength += std::distance(prev.AsInternal(), it.AsInternal());
- } // Postcondition: i == end
-
- return { resBegin, resLength };
-}
-
-TEST_CASE("stringRange() with ASCII") {
- auto a = StringRange("This is an ASCII string", 1, 1 + 5);
- std::string range(a);
- CHECK(range == "his i");
-}
-
-TEST_CASE("stringRange() with BMP codepoints") {
- std::string range(StringRange("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
- CHECK(range == "t \u8FD9\u662F\u4E00");
-}
-
-size_t StringLength(std::string_view str) {
- size_t result = 0;
- for (char32_t _ : Utf8IterableString(str)) {
- result++;
- }
- return result;
-}
-
-TEST_CASE("StringLength() test") {
- CHECK(StringLength("This is an ASCII string") == 23);
- CHECK(StringLength("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
-}
-
-CodepointInfo StringLastCodepoint(std::string_view str) {
- Utf8Iterator it{ str.begin() };
- Utf8Iterator prev{ it };
- size_t codepoints = 0;
-
- Utf8Iterator end{ str.end() };
- while (it != end) {
- codepoints++;
-
- prev = it;
- it++;
- }
- // it == end
- // prev == <last codepoint in str>
-
- return {
- codepoints - 1,
- (size_t)std::distance(str.begin(), prev.AsInternal()),
- };
-}
-
-TEST_CASE("stringLastCodepoint() ASCII test") {
- auto [index, byteOffset] = StringLastCodepoint("This is an ASCII string");
- CHECK(index == 22);
- CHECK(index == 22);
-}
-
-TEST_CASE("stringLastCodepoint() BMP test") {
- auto [index, byteOffset] = StringLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
- CHECK(index == 22);
- CHECK(byteOffset == 40);
-}
-
-CodepointInfo StringCodepoint(std::string_view str, size_t codepointIdx) {
- Utf8Iterator it{ str.begin() };
- Utf8Iterator prev{ it };
- size_t codepoint = 0;
-
- Utf8Iterator end{ str.end() };
- while (true) {
- if (codepoint == codepointIdx) {
- return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) };
- }
- if (it == end) {
- return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) };
- }
-
- codepoint++;
-
- prev = it;
- it++;
- }
-}
-
-TEST_CASE("stringCodepoint() ASCII test") {
- auto [codepointOffset, byteOffset] = StringCodepoint("This is an ASCII string", 6);
- CHECK(codepointOffset == 6);
- CHECK(byteOffset == 6);
-}
-
-TEST_CASE("stringCodepoint() ASCII past-the-end test") {
- auto [codepointOffset, byteOffset] = StringCodepoint("This is an ASCII string", 100);
- CHECK(codepointOffset == 22);
- CHECK(byteOffset == 22);
-}
-
-TEST_CASE("stringCodepoint() BMP test") {
- auto [codepointOffset, byteOffset] = StringCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
- CHECK(codepointOffset == 14);
- CHECK(byteOffset == 16);
-}
-
-TEST_CASE("stringCodepoint() BMP past-the-end test") {
- auto [codepointOffset, byteOffset] = StringCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
- CHECK(codepointOffset == 22);
- CHECK(byteOffset == 40);
-}