2 files changed, 429 insertions, 0 deletions
diff --git a/source/10-common/String.cpp b/source/10-common/String.cpp
new file mode 100644
index 0000000..9e61893
--- /dev/null
+++ b/source/10-common/String.cpp
@@ -0,0 +1,350 @@
+#include "String.hpp"
+
+/*
+#include <doctest/doctest.h>
+*/
+
+// Builds a code point iterator positioned at the given byte iterator.
+// The cached code point starts out stale (see the in-class member initializers) and is decoded on first dereference.
+Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
+	: mIter(std::move(it)) {
+}
+
+// Single-bit masks used to inspect the high bits of a UTF-8 byte, numbered from the most significant bit.
+constexpr unsigned char kFirstBitMask = 0x80; // 1000'0000
+constexpr unsigned char kSecondBitMask = 0x40; // 0100'0000
+constexpr unsigned char kThirdBitMask = 0x20; // 0010'0000
+constexpr unsigned char kFourthBitMask = 0x10; // 0001'0000
+constexpr unsigned char kFifthBitMask = 0x08; // 0000'1000
+
+// Pre-increment: steps over the code point that starts at the current byte.
+// The step size is derived from the high bits of the current byte only; no continuation bytes are inspected
+// and no end-of-string check is made here.
+Utf8Iterator& Utf8Iterator::operator++() {
+	const char lead = *mIter;
+
+	std::string::difference_type step;
+	if ((lead & kFirstBitMask) == 0) {
+		// 0xxxxxxx: plain ASCII, a single octet.
+		step = 1;
+	} else if ((lead & kThirdBitMask) == 0) {
+		// Third bit clear: treated as the lead of a two-octet code point.
+		step = 2;
+	} else if ((lead & kFourthBitMask) == 0) {
+		// 1110xxxx: three-octet code point.
+		step = 3;
+	} else {
+		// 1111xxxx: four-octet code point.
+		step = 4;
+	}
+
+	mIter += step;
+	// The cached code point no longer matches the position.
+	mDirty = true;
+	return *this;
+}
+
+// Post-increment: advances this iterator by one code point and returns a copy of its previous state.
+Utf8Iterator Utf8Iterator::operator++(int) {
+	Utf8Iterator before(*this);
+	operator++();
+	return before;
+}
+
+// Pre-decrement: moves back to the first byte of the preceding code point.
+// Walks backwards over at most four bytes in total; no start-of-string check is made here.
+Utf8Iterator& Utf8Iterator::operator--() {
+	--mIter;
+
+	// High bit set: the byte just before the old position is part of a multi-octet code point.
+	if ((*mIter & kFirstBitMask) != 0) {
+		--mIter;
+		// A clear second bit marks a continuation byte (10xxxxxx); keep stepping back, at most twice more,
+		// until a byte with the second bit set (the lead byte) is found.
+		for (int extraSteps = 0; extraSteps < 2 && (*mIter & kSecondBitMask) == 0; ++extraSteps) {
+			--mIter;
+		}
+	}
+
+	// The cached code point no longer matches the position.
+	mDirty = true;
+	return *this;
+}
+
+// Post-decrement: moves this iterator back by one code point and returns a copy of its previous state.
+Utf8Iterator Utf8Iterator::operator--(int) {
+	Utf8Iterator before(*this);
+	operator--();
+	return before;
+}
+
+// Returns the code point at the current position, by value.
+// Decoding is delegated to UpdateCurrentValue(), which writes the result into the mutable cache member.
+char32_t Utf8Iterator::operator*() const {
+	this->UpdateCurrentValue();
+	return this->mCurrentCodePoint;
+}
+
+// Exposes the underlying byte iterator, i.e. the position of the current code point's first byte.
+// Does not touch the cached code point.
+std::string_view::iterator Utf8Iterator::AsInternal() const {
+	const std::string_view::iterator bytePosition = mIter;
+	return bytePosition;
+}
+
+// Two code point iterators are equal when they sit on the same byte; the cached code point is not compared.
+bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
+	const bool samePosition = (lhs.mIter == rhs.mIter);
+	return samePosition;
+}
+
+// Two code point iterators differ when they sit on different bytes; the cached code point is not compared.
+bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
+	const bool differentPosition = (lhs.mIter != rhs.mIter);
+	return differentPosition;
+}
+
+// Compares a code point iterator directly against a raw byte iterator (e.g. `str.end()`).
+bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
+	const bool samePosition = (lhs.mIter == rhs);
+	return samePosition;
+}
+
+// Negated comparison of a code point iterator against a raw byte iterator (e.g. `str.end()`).
+bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
+	const bool differentPosition = (lhs.mIter != rhs);
+	return differentPosition;
+}
+
+// Decodes the code point that starts at mIter into mCurrentCodePoint, unless the cached value is still current.
+//
+// The sequence length is taken from the high bits of the first byte; continuation bytes are not validated and
+// no end-of-string check is made, so the iterator must point at the start of a complete code point.
+//
+// The cache is marked clean after decoding so that repeated dereferences at the same position do not decode again
+// (operator++ and operator-- mark it dirty when the position changes).
+void Utf8Iterator::UpdateCurrentValue() const {
+	if (!mDirty) {
+		return;
+	}
+
+	// Six payload bits of the byte `offset` positions after the lead byte (continuation bytes look like 10xxxxxx).
+	const auto payload = [this](std::string_view::difference_type offset) -> char32_t {
+		return static_cast<unsigned char>(*(mIter + offset)) & 0x3f;
+	};
+
+	const auto lead = static_cast<unsigned char>(*mIter);
+	if ((lead & kFirstBitMask) == 0) {
+		// 0xxxxxxx: value is at most 127, a plain ASCII code point.
+		mCurrentCodePoint = lead;
+	} else if ((lead & kThirdBitMask) == 0) {
+		// 110xxxxx 10xxxxxx: two-octet code point.
+		mCurrentCodePoint = (static_cast<char32_t>(lead & 0x1f) << 6) | payload(1);
+	} else if ((lead & kFourthBitMask) == 0) {
+		// 1110xxxx 10xxxxxx 10xxxxxx: lead byte is at least 224, three-octet code point.
+		mCurrentCodePoint = (static_cast<char32_t>(lead & 0x0f) << 12) | (payload(1) << 6) | payload(2);
+	} else {
+		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: lead byte is at least 240, four-octet code point.
+		mCurrentCodePoint = (static_cast<char32_t>(lead & 0x07) << 18) | (payload(1) << 12) | (payload(2) << 6) | payload(3);
+	}
+
+	mDirty = false;
+}
+
+// Stores the view as-is; the characters themselves are not copied.
+Utf8IterableString::Utf8IterableString(std::string_view str)
+	: mStr(str) {
+}
+
+// Code point iterator positioned at the first byte of the viewed string.
+Utf8Iterator Utf8IterableString::begin() const {
+	Utf8Iterator first{ mStr.begin() };
+	return first;
+}
+
+// Code point iterator positioned one past the last byte of the viewed string.
+Utf8Iterator Utf8IterableString::end() const {
+	Utf8Iterator pastLast{ mStr.end() };
+	return pastLast;
+}
+
+/*
+TEST_CASE("Iterating ASCII string") {
+	std::string ascii("This is an ASCII string");
+	std::u32string output;
+	output.reserve(ascii.length());
+
+	for (char32_t c : Utf8IterableString(ascii)) {
+		output += c;
+	}
+
+	CHECK(output == U"This is an ASCII string");
+}
+
+// BMP: Basic Multilingual Plane
+TEST_CASE("Iterating BMP string") {
+	std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+	std::u32string output;
+	output.reserve(10);
+
+	for (char32_t c : Utf8IterableString(unicode)) {
+		output += c;
+	}
+
+	CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+}
+*/
+
+// Decodes a UTF-8 string into UTF-32, producing one char32_t per code point.
+std::u32string Utils::ConvertUtf8To32(std::string_view in) {
+	std::u32string decoded;
+	// Every code point occupies at least one byte, so the byte count is an upper bound on the result length.
+	decoded.reserve(in.size());
+
+	const Utf8IterableString codepoints(in);
+	const Utf8Iterator last = codepoints.end();
+	for (Utf8Iterator it = codepoints.begin(); it != last; ++it) {
+		decoded += *it;
+	}
+	return decoded;
+}
+
+// Encodes a UTF-32 string as UTF-8.
+// Values above 0x10FFFF match none of the branches and therefore contribute no bytes to the output.
+std::string Utils::ConvertUtf32To8(std::u32string_view in) {
+	std::string encoded;
+
+	// Appends the low eight bits of `bits` as one byte.
+	const auto put = [&encoded](char32_t bits) {
+		encoded += static_cast<char>(bits);
+	};
+	// Appends a continuation byte (10xxxxxx) carrying the low six bits of `bits`.
+	const auto putTrail = [&put](char32_t bits) {
+		put(0x80 | (bits & 0x3F));
+	};
+
+	for (const char32_t cp : in) {
+		if (cp <= 0x7F) {
+			put(cp); // 0xxxxxxx
+		} else if (cp <= 0x7FF) {
+			put(0xC0 | (cp >> 6)); // 110xxxxx
+			putTrail(cp);
+		} else if (cp <= 0xFFFF) {
+			put(0xE0 | (cp >> 12)); // 1110xxxx
+			putTrail(cp >> 6);
+			putTrail(cp);
+		} else if (cp <= 0x10FFFF) {
+			put(0xF0 | (cp >> 18)); // 11110xxx
+			putTrail(cp >> 12);
+			putTrail(cp >> 6);
+			putTrail(cp);
+		}
+	}
+	return encoded;
+}
+
+/*
+TEST_CASE("Utils::ConvertUtf32To8() with ASCII") {
+	auto output = Utils::ConvertUtf32To8(U"This is an ASCII string");
+	CHECK(output == "This is an ASCII string");
+}
+
+TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") {
+	auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+	CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+}
+ */
+
+// Returns the part of `str` that covers the code points with indices in [begin, end).
+//
+// The result is a view into the bytes of `str`; nothing is copied.
+// Indices past the last code point are clamped to the end of the string, so an out-of-range `begin` yields an
+// empty view and an out-of-range `end` yields everything from `begin` onwards. `end <= begin` yields an empty view.
+std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) {
+	const auto strEnd = str.end();
+	Utf8Iterator it{ str.begin() };
+
+	// Moves `it` forward by up to `count` code points, stopping at the end of the string,
+	// and returns the byte offset reached (never more than str.size()).
+	const auto skipCodepoints = [&](size_t count) -> size_t {
+		// `<` rather than `!=`: a lead byte that announces more octets than remain would step past strEnd.
+		for (; count > 0 && it.AsInternal() < strEnd; --count) {
+			++it;
+		}
+		const auto offset = static_cast<size_t>(std::distance(str.begin(), it.AsInternal()));
+		return offset < str.size() ? offset : str.size();
+	};
+
+	const size_t firstByte = skipCodepoints(begin);
+	const size_t lastByte = end > begin ? skipCodepoints(end - begin) : firstByte;
+
+	// firstByte <= str.size() is guaranteed above, so substr cannot throw.
+	return str.substr(firstByte, lastByte - firstByte);
+}
+
+/*
+TEST_CASE("Utils::SliceUtf8() with ASCII") {
+	auto a = Utils::SliceUtf8("This is an ASCII string", 1, 1 + 5);
+	std::string range(a);
+	CHECK(range == "his i");
+}
+
+TEST_CASE("Utils::CreateRange() with BMP codepoints") {
+	std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
+	CHECK(range == "t \u8FD9\u662F\u4E00");
+}
+*/
+
+// Counts the code points in a UTF-8 string.
+// Only the iterator is advanced; the code points are never dereferenced, so nothing is decoded
+// and there is no unused loop variable.
+size_t Utils::CountUtf8Codepoints(std::string_view str) {
+	const Utf8IterableString codepoints(str);
+	const Utf8Iterator last = codepoints.end();
+
+	size_t count = 0;
+	for (Utf8Iterator it = codepoints.begin(); it != last; ++it) {
+		++count;
+	}
+	return count;
+}
+
+/*
+TEST_CASE("Utils::CountUtf8Codepoints() test") {
+	CHECK(Utils::CountUtf8Codepoints("This is an ASCII string") == 23);
+	CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
+}
+*/
+
+// Locates the last code point of `str`.
+// Returns its zero-based code point index and the byte offset of its first byte.
+// An empty string has no last code point; { 0, 0 } is returned in that case instead of letting the index underflow.
+Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) {
+	if (str.empty()) {
+		return { 0, 0 };
+	}
+
+	const Utf8Iterator end{ str.end() };
+	Utf8Iterator it{ str.begin() };
+	Utf8Iterator prev{ it };
+	size_t codepoints = 0;
+
+	while (it != end) {
+		++codepoints;
+
+		prev = it;
+		++it;
+	}
+	// it == end
+	// prev == <last codepoint in str>
+
+	return {
+		codepoints - 1,
+		static_cast<size_t>(std::distance(str.begin(), prev.AsInternal())),
+	};
+}
+
+/*
+TEST_CASE("Utils::FindLastCodepoint() ASCII test") {
+	auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string");
+	CHECK(index == 22);
+	CHECK(byteOffset == 22);
+}
+
+TEST_CASE("Utils::FindLastCodepoint() BMP test") {
+	auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
+	CHECK(index == 22);
+	CHECK(byteOffset == 40);
+}
+*/
+
+// Locates the code point with index `codepointIdx` in `str`.
+// Returns its zero-based code point index and the byte offset of its first byte.
+// When `codepointIdx` is at or beyond the number of code points, the last code point is reported instead.
+// An empty string has no code points; { 0, 0 } is returned in that case instead of letting the index underflow.
+Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) {
+	const Utf8Iterator end{ str.end() };
+	Utf8Iterator it{ str.begin() };
+	Utf8Iterator prev{ it };
+	size_t codepoint = 0;
+
+	// The end check comes first so that the past-the-end position is never reported as a code point.
+	while (it != end) {
+		if (codepoint == codepointIdx) {
+			return { codepoint, static_cast<size_t>(std::distance(str.begin(), it.AsInternal())) };
+		}
+
+		prev = it;
+		++it;
+		++codepoint;
+	}
+
+	// Ran off the end: `codepoint` is now the total count and `prev` is the last code point (if any).
+	if (codepoint == 0) {
+		return { 0, 0 };
+	}
+	return { codepoint - 1, static_cast<size_t>(std::distance(str.begin(), prev.AsInternal())) };
+}
+
+/*
+TEST_CASE("Utils::FindCodepoint() ASCII test") {
+	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6);
+	CHECK(codepointOffset == 6);
+	CHECK(byteOffset == 6);
+}
+
+TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") {
+	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100);
+	CHECK(codepointOffset == 22);
+	CHECK(byteOffset == 22);
+}
+
+TEST_CASE("Utils::FindCodepoint() BMP test") {
+	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
+	CHECK(codepointOffset == 14);
+	CHECK(byteOffset == 16);
+}
+
+TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") {
+	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
+	CHECK(codepointOffset == 22);
+	CHECK(byteOffset == 40);
+}
+*/
diff --git a/source/10-common/String.hpp b/source/10-common/String.hpp
new file mode 100644
index 0000000..8d54bad
--- /dev/null
+++ b/source/10-common/String.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <string_view>
+
+/// Iterator over the code points of a UTF-8 encoded byte sequence.
+///
+/// Wraps a `std::string_view::iterator`: incrementing/decrementing moves by one whole code point, and dereferencing
+/// yields the decoded code point as a `char32_t`. Only the current position is stored (no range bounds), so callers
+/// are responsible for not moving past either end of the underlying string.
+class Utf8Iterator {
+public:
+	using iterator_category = std::bidirectional_iterator_tag;
+	using value_type = char32_t;
+	using difference_type = std::string_view::difference_type;
+	using pointer = const char32_t*;
+	// NOTE(review): declared as a reference type, but operator*() below returns char32_t by value.
+	using reference = const char32_t&;
+
+private:
+	/// Position of the first byte of the current code point.
+	std::string_view::iterator mIter;
+	/// Most recently decoded code point; `mutable` because it is filled in from the const operator*().
+	mutable char32_t mCurrentCodePoint = 0;
+	/// Set whenever the iterator moves; consulted by UpdateCurrentValue() before decoding.
+	mutable bool mDirty = true;
+
+public:
+	/// Creates an iterator positioned at `it`. Not `explicit`, so a raw byte iterator converts implicitly.
+	Utf8Iterator(std::string_view::iterator it);
+	~Utf8Iterator() = default;
+
+	Utf8Iterator(const Utf8Iterator& that) = default;
+	Utf8Iterator& operator=(const Utf8Iterator& that) = default;
+	Utf8Iterator(Utf8Iterator&& that) = default;
+	Utf8Iterator& operator=(Utf8Iterator&& that) = default;
+
+	/// Move forward/backward by one code point (prefix forms return *this, postfix forms return the old state).
+	Utf8Iterator& operator++();
+	Utf8Iterator operator++(int);
+	Utf8Iterator& operator--();
+	Utf8Iterator operator--(int);
+
+	/// Decoded code point at the current position.
+	char32_t operator*() const;
+	/// The wrapped byte iterator, i.e. where the current code point starts in the underlying string.
+	std::string_view::iterator AsInternal() const;
+
+	/// Comparisons look at the byte position only, either against another Utf8Iterator or a raw byte iterator.
+	friend bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs);
+	friend bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs);
+	friend bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs);
+	friend bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs);
+
+private:
+	/// Decodes the code point at mIter into mCurrentCodePoint.
+	void UpdateCurrentValue() const;
+};
+
+/// Adapts a `std::string_view` into a range of code points, so it can be used in a range-based for loop
+/// (`for (char32_t c : Utf8IterableString(str))`).
+/// Holds only the view; the viewed characters must outlive this object and any iterators obtained from it.
+class Utf8IterableString {
+private:
+	/// The UTF-8 bytes being iterated (not owned).
+	std::string_view mStr;
+
+public:
+	/// Not `explicit`, so a `std::string_view` converts implicitly.
+	Utf8IterableString(std::string_view str);
+	/// Iterator at the first byte of the view.
+	Utf8Iterator begin() const;
+	/// Iterator one past the last byte of the view.
+	Utf8Iterator end() const;
+};
+
+namespace Utils {
+
+/// Decode the given UTF-8 string into UTF-32, one `char32_t` per codepoint.
+std::u32string ConvertUtf8To32(std::string_view str);
+/// Encode the given UTF-32 string as UTF-8. Values above 0x10FFFF produce no output bytes.
+std::string ConvertUtf32To8(std::u32string_view str);
+
+/// Slice the given UTF-8 string into the given range, in codepoints.
+/// \param str A UTF-8 encoded string. The returned view points into its bytes; nothing is copied.
+/// \param begin Index of the first codepoint to include.
+/// \param end Index one past the last codepoint to include.
+std::string_view SliceUtf8(std::string_view str, size_t begin, size_t end);
+
+/// Calculate the given UTF-8 string's number of codepoints.
+size_t CountUtf8Codepoints(std::string_view str);
+
+/// Location of one codepoint inside a UTF-8 string.
+struct CodepointInfo {
+	/// Zero-based position counted in codepoints.
+	size_t index;
+	/// Zero-based position of the codepoint's first byte, counted in bytes from the start of the string.
+	size_t byteOffset;
+};
+
+/// Find info about the last codepoint in the given UTF-8 string.
+/// \param str A non-empty UTF-8 encoded string.
+CodepointInfo FindLastCodepoint(std::string_view str);
+/// Find info about the nth codepoint in the given UTF-8 string. If codepointIdx is larger than the length, info for the last codepoint will be returned.
+/// \param str A non-empty UTF-8 encoded string.
+/// \param codepointIdx Zero-based index of the codepoint to look up.
+CodepointInfo FindCodepoint(std::string_view str, size_t codepointIdx);
+
+} // namespace Utils