aboutsummaryrefslogtreecommitdiff
path: root/source/10-common
diff options
context:
space:
mode:
Diffstat (limited to 'source/10-common')
-rw-r--r--source/10-common/String.cpp350
-rw-r--r--source/10-common/String.hpp79
2 files changed, 429 insertions, 0 deletions
diff --git a/source/10-common/String.cpp b/source/10-common/String.cpp
new file mode 100644
index 0000000..9e61893
--- /dev/null
+++ b/source/10-common/String.cpp
@@ -0,0 +1,350 @@
+#include "String.hpp"
+
+/*
+#include <doctest/doctest.h>
+*/
+
+// Wraps a raw byte iterator. operator++ and operator* treat the byte at this position as the
+// lead octet of a UTF-8 sequence, so `it` should sit on a code point boundary (or be the end).
+Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
+    : mIter(std::move(it)) {}
+
+// Single-bit masks for inspecting the high bits of a UTF-8 octet, counted from the most
+// significant bit ("first") downwards.
+constexpr unsigned char kFirstBitMask = 0x80;  // 1000'0000
+constexpr unsigned char kSecondBitMask = 0x40; // 0100'0000
+constexpr unsigned char kThirdBitMask = 0x20;  // 0010'0000
+constexpr unsigned char kFourthBitMask = 0x10; // 0001'0000
+constexpr unsigned char kFifthBitMask = 0x08;  // 0000'1000
+
+// Pre-increment: skip over the whole UTF-8 sequence that starts at the current byte.
+Utf8Iterator& Utf8Iterator::operator++() {
+    const char lead = *mIter;
+
+    // The number of octets in a sequence is encoded in the high bits of its lead octet:
+    //   0xxxxxxx -> 1    110xxxxx -> 2    1110xxxx -> 3    11110xxx -> 4
+    // Only the first, third and fourth bits are inspected; the byte is not validated.
+    std::string::difference_type length;
+    if ((lead & kFirstBitMask) == 0) {
+        length = 1;
+    } else if ((lead & kThirdBitMask) == 0) {
+        length = 2;
+    } else if ((lead & kFourthBitMask) == 0) {
+        length = 3;
+    } else {
+        length = 4;
+    }
+
+    mIter += length;
+    // The cached code point belongs to the previous position.
+    mDirty = true;
+    return *this;
+}
+
+// Post-increment: advance by one code point and hand back a copy of the old position.
+Utf8Iterator Utf8Iterator::operator++(int) {
+    Utf8Iterator before{ *this };
+    operator++();
+    return before;
+}
+
+// Pre-decrement: step back to the first octet of the preceding UTF-8 sequence.
+Utf8Iterator& Utf8Iterator::operator--() {
+    // Land on the last octet of the previous code point.
+    --mIter;
+
+    if (*mIter & kFirstBitMask) {
+        // Not ASCII, so the sequence is at least two octets long: step over this trailing octet.
+        --mIter;
+
+        // Keep stepping back while the second-highest bit is clear (the 10xxxxxx continuation
+        // pattern). At most two more steps, since a sequence is four octets at the longest.
+        for (int extra = 0; extra < 2 && (*mIter & kSecondBitMask) == 0; ++extra) {
+            --mIter;
+        }
+    }
+
+    // The cached code point belongs to the previous position.
+    mDirty = true;
+    return *this;
+}
+
+// Post-decrement: step back by one code point and hand back a copy of the old position.
+Utf8Iterator Utf8Iterator::operator--(int) {
+    Utf8Iterator before{ *this };
+    operator--();
+    return before;
+}
+
+// Returns the code point at the current position (by value, not by reference).
+// Decoding is delegated to UpdateCurrentValue(), which writes into the mutable cache member.
+char32_t Utf8Iterator::operator*() const {
+    this->UpdateCurrentValue();
+    return this->mCurrentCodePoint;
+}
+
+// Exposes the underlying byte iterator, i.e. the position of the current sequence's first octet.
+// Does not touch the decoded-value cache.
+std::string_view::iterator Utf8Iterator::AsInternal() const {
+    return this->mIter;
+}
+
+// Comparisons look only at the byte position; the cached code point and dirty flag are ignored.
+
+// Two UTF-8 iterators are equal when they point at the same byte.
+bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
+    return lhs.AsInternal() == rhs.AsInternal();
+}
+
+bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
+    return lhs.AsInternal() != rhs.AsInternal();
+}
+
+// Mixed comparison against a raw byte iterator, e.g. `it != str.end()`.
+bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
+    return lhs.AsInternal() == rhs;
+}
+
+bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
+    return lhs.AsInternal() != rhs;
+}
+
+// Decodes the UTF-8 sequence starting at mIter into mCurrentCodePoint, unless the cached value
+// is still valid. Const because it only writes the mutable cache members.
+//
+// The sequence is not validated: the length is taken from the lead octet's high bits and the
+// continuation octets are read unconditionally, so the input must be well-formed UTF-8 and the
+// iterator must not be at the end of the string.
+void Utf8Iterator::UpdateCurrentValue() const {
+    if (!mDirty) {
+        return;
+    }
+
+    // Low six payload bits of the continuation octet `index` positions after the lead octet.
+    const auto payload = [this](std::string_view::difference_type index) -> char32_t {
+        return static_cast<char32_t>(*(mIter + index) & 0x3f);
+    };
+
+    const char firstByte = *mIter;
+    if ((firstByte & kFirstBitMask) == 0) {
+        // 0xxxxxxx: single-octet (ASCII) code point.
+        mCurrentCodePoint = static_cast<char32_t>(firstByte);
+    } else if ((firstByte & kThirdBitMask) == 0) {
+        // 110xxxxx 10xxxxxx: two octets, 5 + 6 payload bits.
+        mCurrentCodePoint = (static_cast<char32_t>(firstByte & 0x1f) << 6)
+            | payload(1);
+    } else if ((firstByte & kFourthBitMask) == 0) {
+        // 1110xxxx 10xxxxxx 10xxxxxx: three octets, 4 + 6 + 6 payload bits.
+        mCurrentCodePoint = (static_cast<char32_t>(firstByte & 0x0f) << 12)
+            | (payload(1) << 6)
+            | payload(2);
+    } else {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four octets, 3 + 6 + 6 + 6 payload bits.
+        mCurrentCodePoint = (static_cast<char32_t>(firstByte & 0x07) << 18)
+            | (payload(1) << 12)
+            | (payload(2) << 6)
+            | payload(3);
+    }
+
+    // The cache now matches mIter; operator++/operator-- set the flag again when they move.
+    mDirty = false;
+}
+
+// Stores the view as-is; the bytes it refers to are not copied.
+Utf8IterableString::Utf8IterableString(std::string_view str)
+    : mStr(str) {}
+
+// Code point iterator positioned on the first byte of the wrapped view.
+Utf8Iterator Utf8IterableString::begin() const {
+    return Utf8Iterator{ mStr.begin() };
+}
+
+// Code point iterator positioned one past the last byte of the wrapped view.
+Utf8Iterator Utf8IterableString::end() const {
+    return Utf8Iterator{ mStr.end() };
+}
+
+/*
+TEST_CASE("Iterating ASCII string") {
+ std::string ascii("This is an ASCII string");
+ std::u32string output;
+ output.reserve(ascii.length());
+
+ for (char32_t c : Utf8IterableString(ascii)) {
+ output += c;
+ }
+
+ CHECK(output == U"This is an ASCII string");
+}
+
+// BMP: Basic Multilingual Plane
+TEST_CASE("Iterating BMP string") {
+ std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+ std::u32string output;
+ output.reserve(10);
+
+ for (char32_t c : Utf8IterableString(unicode)) {
+ output += c;
+ }
+
+ CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+}
+*/
+
+// Decodes a UTF-8 byte string into a sequence of UTF-32 code points.
+// The input is walked with Utf8Iterator, which does not validate it.
+std::u32string Utils::ConvertUtf8To32(std::string_view in) {
+    std::u32string decoded;
+    // Every code point occupies at least one byte, so the byte count is an upper bound on the
+    // number of code points; reserving it up front avoids reallocation while appending.
+    decoded.reserve(in.size());
+
+    const Utf8IterableString codepoints{ in };
+    for (auto it = codepoints.begin(), last = codepoints.end(); it != last; ++it) {
+        decoded.push_back(*it);
+    }
+    return decoded;
+}
+
+// Encodes a sequence of UTF-32 code points as UTF-8.
+// Values above 0x10FFFF produce no output; no other validation is performed (surrogate values
+// are encoded like any other three-octet code point).
+std::string Utils::ConvertUtf32To8(std::u32string_view in) {
+    std::string encoded;
+    // Appends one octet; the narrowing to `char` is deliberate.
+    const auto put = [&encoded](char32_t octet) { encoded += static_cast<char>(octet); };
+
+    for (const char32_t cp : in) {
+        if (cp <= 0x7F) {
+            put(cp); // 0xxxxxxx
+        } else if (cp <= 0x7FF) {
+            put(0xC0 | (cp >> 6));   // 110xxxxx
+            put(0x80 | (cp & 0x3F)); // 10xxxxxx
+        } else if (cp <= 0xFFFF) {
+            put(0xE0 | (cp >> 12));         // 1110xxxx
+            put(0x80 | ((cp >> 6) & 0x3F)); // 10xxxxxx
+            put(0x80 | (cp & 0x3F));        // 10xxxxxx
+        } else if (cp <= 0x10FFFF) {
+            put(0xF0 | (cp >> 18));          // 11110xxx
+            put(0x80 | ((cp >> 12) & 0x3F)); // 10xxxxxx
+            put(0x80 | ((cp >> 6) & 0x3F));  // 10xxxxxx
+            put(0x80 | (cp & 0x3F));         // 10xxxxxx
+        }
+    }
+    return encoded;
+}
+
+/*
+TEST_CASE("Utils::ConvertUtf32To8() with ASCII") {
+ auto output = Utils::ConvertUtf32To8(U"This is an ASCII string");
+ CHECK(output == "This is an ASCII string");
+}
+
+TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") {
+ auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+ CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+}
+ */
+
+// Returns the sub-view of `str` covering code points [begin, end), counted from zero.
+//
+// Both bounds are clamped to the end of the string: if `begin` is at or past the number of
+// code points the result is empty, and if `end` is past it the slice runs to the end of `str`.
+// An empty result is also returned when `end <= begin`. The returned view refers to the same
+// bytes as `str`; nothing is copied.
+//
+// `str` is assumed to be well-formed UTF-8; Utf8Iterator does not validate its input.
+std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) {
+    const auto strBegin = str.begin();
+    const auto strEnd = str.end();
+
+    Utf8Iterator it{ strBegin };
+    size_t i = 0; // Index of the code point `it` currently points at
+
+    // Skip until `it` points to the `begin`-th code point, or the string runs out.
+    while (i < begin && it.AsInternal() < strEnd) {
+        ++i;
+        ++it;
+    }
+    const auto sliceBegin = it.AsInternal();
+
+    // Advance to the `end`-th code point, or the end of the string.
+    while (i < end && it.AsInternal() < strEnd) {
+        ++i;
+        ++it;
+    }
+    const auto sliceEnd = it.AsInternal();
+
+    // Work with byte offsets rather than dereferencing an iterator that may be the end iterator.
+    const auto offset = static_cast<size_t>(sliceBegin - strBegin);
+    const auto length = static_cast<size_t>(sliceEnd - sliceBegin);
+
+    // A truncated trailing sequence can carry the iterator past the end; clamp so substr() cannot
+    // throw. substr() itself clamps the length.
+    return str.substr(offset < str.size() ? offset : str.size(), length);
+}
+
+/*
+TEST_CASE("Utils::SliceUtf8() with ASCII") {
+ auto a = Utils::SliceUtf8("This is an ASCII string", 1, 1 + 5);
+ std::string range(a);
+ CHECK(range == "his i");
+}
+
+TEST_CASE("Utils::SliceUtf8() with BMP codepoints") {
+ std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
+ CHECK(range == "t \u8FD9\u662F\u4E00");
+}
+*/
+
+// Counts the code points in `str` by stepping a Utf8Iterator from start to end.
+// Only the lead octet of each sequence is inspected; the code points are never decoded.
+// `str` is assumed to be well-formed UTF-8.
+size_t Utils::CountUtf8Codepoints(std::string_view str) {
+    const Utf8IterableString codepoints{ str };
+
+    size_t result = 0;
+    for (auto it = codepoints.begin(), last = codepoints.end(); it != last; ++it) {
+        ++result;
+    }
+    return result;
+}
+
+/*
+TEST_CASE("Utils::CountUtf8Codepoints() test") {
+    CHECK(Utils::CountUtf8Codepoints("This is an ASCII string") == 23);
+    CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
+}
+*/
+
+// Returns the code point index and byte offset of the last code point in `str`.
+// `str` is expected to be non-empty, well-formed UTF-8; an empty string yields { 0, 0 }.
+Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) {
+    if (str.empty()) {
+        return { 0, 0 };
+    }
+
+    const Utf8Iterator end{ str.end() };
+    Utf8Iterator it{ str.begin() };
+    Utf8Iterator last{ it };
+    size_t codepoints = 0;
+
+    while (it != end) {
+        last = it;
+        ++it;
+        ++codepoints;
+    }
+    // it == end
+    // last == <last codepoint in str>
+
+    return {
+        codepoints - 1,
+        static_cast<size_t>(last.AsInternal() - str.begin()),
+    };
+}
+
+/*
+TEST_CASE("Utils::FindLastCodepoint() ASCII test") {
+    auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string");
+    CHECK(index == 22);
+    CHECK(byteOffset == 22);
+}
+
+TEST_CASE("Utils::FindLastCodepoint() BMP test") {
+    auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+    CHECK(index == 22);
+    CHECK(byteOffset == 40);
+}
+*/
+
+// Returns the index and byte offset of the `codepointIdx`-th code point in `str`.
+// If `codepointIdx` equals the number of code points, the one-past-the-end position is returned
+// ({ count, str.size() }); if it is larger, info for the last code point is returned instead.
+// `str` is expected to be non-empty, well-formed UTF-8; an empty string yields { 0, 0 }.
+Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) {
+    if (str.empty()) {
+        return { 0, 0 };
+    }
+
+    const Utf8Iterator end{ str.end() };
+    Utf8Iterator it{ str.begin() };
+    Utf8Iterator prev{ it };
+    size_t codepoint = 0;
+
+    while (codepoint != codepointIdx && it != end) {
+        prev = it;
+        ++it;
+        ++codepoint;
+    }
+
+    if (codepoint == codepointIdx) {
+        return { codepoint, static_cast<size_t>(it.AsInternal() - str.begin()) };
+    }
+    // Ran off the end before reaching the requested index: fall back to the last code point.
+    return { codepoint - 1, static_cast<size_t>(prev.AsInternal() - str.begin()) };
+}
+
+/*
+TEST_CASE("Utils::FindCodepoint() ASCII test") {
+    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6);
+    CHECK(codepointOffset == 6);
+    CHECK(byteOffset == 6);
+}
+
+TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") {
+    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100);
+    CHECK(codepointOffset == 22);
+    CHECK(byteOffset == 22);
+}
+
+TEST_CASE("Utils::FindCodepoint() BMP test") {
+    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
+    CHECK(codepointOffset == 14);
+    CHECK(byteOffset == 16);
+}
+
+TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") {
+    auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
+    CHECK(codepointOffset == 22);
+    CHECK(byteOffset == 40);
+}
+*/
diff --git a/source/10-common/String.hpp b/source/10-common/String.hpp
new file mode 100644
index 0000000..8d54bad
--- /dev/null
+++ b/source/10-common/String.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <string_view>
+
+/// Bidirectional iterator that walks a UTF-8 encoded byte sequence one code point at a time.
+///
+/// Dereferencing yields the decoded code point by value. The decoded value is kept in a mutable
+/// cache guarded by a dirty flag, which is why dereferencing is a const operation.
+/// The input is not validated; it is expected to be well-formed UTF-8.
+class Utf8Iterator {
+public:
+    using iterator_category = std::bidirectional_iterator_tag;
+    using value_type = char32_t;
+    using difference_type = std::string_view::difference_type;
+    using pointer = const char32_t*;
+    using reference = const char32_t&;
+
+    /// Wrap a byte iterator that points at the first octet of a code point (or at the end).
+    Utf8Iterator(std::string_view::iterator it);
+
+    // Copy/move construction, copy/move assignment and destruction are the compiler-generated
+    // defaults; every member is trivially copyable.
+
+    /// Move forward by one code point.
+    Utf8Iterator& operator++();
+    Utf8Iterator operator++(int);
+    /// Move backward by one code point.
+    Utf8Iterator& operator--();
+    Utf8Iterator operator--(int);
+
+    /// Decode and return the code point at the current position.
+    char32_t operator*() const;
+    /// The underlying byte iterator, pointing at the first octet of the current code point.
+    std::string_view::iterator AsInternal() const;
+
+    // Comparisons look at the byte position only.
+    friend bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs);
+    friend bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs);
+    friend bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs);
+    friend bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs);
+
+private:
+    /// Refresh mCurrentCodePoint from the bytes at mIter if the cache is marked dirty.
+    void UpdateCurrentValue() const;
+
+    std::string_view::iterator mIter;
+    mutable char32_t mCurrentCodePoint = 0;
+    mutable bool mDirty = true;
+};
+
+/// Adapter that lets a UTF-8 string be used in a range-based for loop over its code points:
+/// `for (char32_t c : Utf8IterableString(str)) { ... }`.
+/// Holds a non-owning view; the viewed bytes must outlive this object and its iterators.
+class Utf8IterableString {
+public:
+    Utf8IterableString(std::string_view str);
+
+    /// Iterator at the first code point.
+    Utf8Iterator begin() const;
+    /// Iterator one past the last byte.
+    Utf8Iterator end() const;
+
+private:
+    std::string_view mStr;
+};
+
+namespace Utils {
+
+/// Decode a UTF-8 encoded string into UTF-32 code points.
+std::u32string ConvertUtf8To32(std::string_view str);
+/// Encode UTF-32 code points as a UTF-8 string.
+std::string ConvertUtf32To8(std::u32string_view str);
+
+/// Slice the given UTF-8 string into the given range, in codepoints.
+/// \param begin Index of the first codepoint to include.
+/// \param end Index one past the last codepoint to include.
+/// \return A view into the bytes of `str`; no data is copied.
+std::string_view SliceUtf8(std::string_view str, size_t begin, size_t end);
+
+/// Calculate the given UTF-8 string's number of codepoints.
+size_t CountUtf8Codepoints(std::string_view str);
+
+/// Position of one codepoint within a UTF-8 string.
+struct CodepointInfo {
+ // Zero-based index of the codepoint, counted in codepoints.
+ size_t index;
+ // Offset of the codepoint's first byte, counted in bytes from the start of the string.
+ size_t byteOffset;
+};
+
+/// Find info about the last codepoint in the given UTF-8 string.
+/// \param str A non-empty UTF-8 encoded string.
+CodepointInfo FindLastCodepoint(std::string_view str);
+/// Find info about the nth codepoint in the given UTF-8 string. If codepointIdx is larger than the length, info for the last codepoint will be returned.
+/// \param str A non-empty UTF-8 encoded string.
+/// \param codepointIdx Zero-based index of the codepoint to look up.
+CodepointInfo FindCodepoint(std::string_view str, size_t codepointIdx);
+
+} // namespace Utils