#include "String.hpp"

/*
#include <doctest/doctest.h>
*/

// Wraps a raw byte iterator. operator++ and operator* interpret the byte under the iterator as
// the lead byte of a UTF-8 sequence.
Utf8Iterator::Utf8Iterator(std::string_view::iterator it)
	: mIter(std::move(it)) {}

// Single-bit masks for inspecting one UTF-8 byte; "first" is the most significant bit.
constexpr unsigned char kFirstBitMask = 0x80; // 1000 0000
constexpr unsigned char kSecondBitMask = 0x40; // 0100 0000
constexpr unsigned char kThirdBitMask = 0x20; // 0010 0000
constexpr unsigned char kFourthBitMask = 0x10; // 0001 0000
constexpr unsigned char kFifthBitMask = 0x08; // 0000 1000

// Pre-increment: skips the code point under the iterator.
//
// The number of bytes to skip is taken from the lead byte alone:
//   0xxxxxxx -> 1 byte (ASCII)
//   110xxxxx -> 2 bytes
//   1110xxxx -> 3 bytes (lead byte >= 0xE0)
//   11110xxx -> 4 bytes (lead byte >= 0xF0)
// Continuation bytes are not inspected. The cached code point is marked stale.
Utf8Iterator& Utf8Iterator::operator++() {
	const char lead = *mIter;

	std::string::difference_type step = 1; // ASCII unless the high bit says otherwise
	if ((lead & kFirstBitMask) != 0) {
		const bool thirdBitSet = (lead & kThirdBitMask) != 0;
		const bool fourthBitSet = (lead & kFourthBitMask) != 0;
		step = !thirdBitSet ? 2 : (fourthBitSet ? 4 : 3);
	}

	mIter += step;
	mDirty = true;
	return *this;
}

// Post-increment: advances to the next code point and returns a copy of the iterator as it was
// before the advance.
Utf8Iterator Utf8Iterator::operator++(int) {
	Utf8Iterator previous(*this);
	operator++();
	return previous;
}

// Pre-decrement: moves back to the lead byte of the preceding code point.
//
// Steps back one byte; if that byte is non-ASCII it is taken to be the last byte of a multi-byte
// sequence, so the iterator keeps stepping back (at most three more bytes in total) until it
// reaches a byte whose second-highest bit is set, i.e. one that is not a 10xxxxxx continuation
// byte. The cached code point is marked stale.
Utf8Iterator& Utf8Iterator::operator--() {
	--mIter;

	const bool isMultiByte = (*mIter & kFirstBitMask) != 0;
	if (isMultiByte) {
		--mIter;
		// Up to two further continuation bytes may sit between here and the lead byte.
		for (int extra = 0; extra < 2 && (*mIter & kSecondBitMask) == 0; ++extra) {
			--mIter;
		}
	}

	mDirty = true;
	return *this;
}

// Post-decrement: moves back to the previous code point and returns a copy of the iterator as it
// was before the move.
Utf8Iterator Utf8Iterator::operator--(int) {
	Utf8Iterator previous(*this);
	operator--();
	return previous;
}

// Returns the code point under the iterator, refreshing the cached value through
// UpdateCurrentValue() first.
char32_t Utf8Iterator::operator*() const {
	this->UpdateCurrentValue();
	return this->mCurrentCodePoint;
}

std::string_view::iterator Utf8Iterator::AsInternal() const {
	// updateCurrentValue();
	return mIter;
}

// Two UTF-8 iterators are equal when their underlying byte iterators are equal; the cached
// code point does not take part in the comparison.
bool operator==(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
	const auto& left = lhs.mIter;
	const auto& right = rhs.mIter;
	return left == right;
}

// Two UTF-8 iterators differ when their underlying byte iterators differ.
bool operator!=(const Utf8Iterator& lhs, const Utf8Iterator& rhs) {
	const auto& left = lhs.mIter;
	const auto& right = rhs.mIter;
	return left != right;
}

// Compares a UTF-8 iterator against a raw byte iterator by position.
bool operator==(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
	const auto& left = lhs.mIter;
	return left == rhs;
}

// True when a UTF-8 iterator and a raw byte iterator are at different positions.
bool operator!=(const Utf8Iterator& lhs, std::string_view::iterator rhs) {
	const auto& left = lhs.mIter;
	return left != rhs;
}

void Utf8Iterator::UpdateCurrentValue() const {
	if (!mDirty) {
		return;
	}

	mCurrentCodePoint = 0;
	char firstByte = *mIter;

	// This means the first byte has a value greater than 127, and so is beyond the ASCII range.
	if (firstByte & kFirstBitMask) {
		// This means that the first byte has a value greater than 191, and so it must be at least a three-octet code point.
		if (firstByte & kThirdBitMask) {
			// This means that the first byte has a value greater than 224, and so it must be a four-octet code point.
			if (firstByte & kFourthBitMask) {
				mCurrentCodePoint = (firstByte & 0x07) << 18;
				char secondByte = *(mIter + 1);
				mCurrentCodePoint += (secondByte & 0x3f) << 12;
				char thirdByte = *(mIter + 2);
				mCurrentCodePoint += (thirdByte & 0x3f) << 6;

				char fourthByte = *(mIter + 3);
				mCurrentCodePoint += (fourthByte & 0x3f);
			} else {
				mCurrentCodePoint = (firstByte & 0x0f) << 12;
				char secondByte = *(mIter + 1);
				mCurrentCodePoint += (secondByte & 0x3f) << 6;
				char thirdByte = *(mIter + 2);
				mCurrentCodePoint += (thirdByte & 0x3f);
			}
		} else {
			mCurrentCodePoint = (firstByte & 0x1f) << 6;
			char secondByte = *(mIter + 1);
			mCurrentCodePoint += (secondByte & 0x3f);
		}
	} else {
		mCurrentCodePoint = firstByte;
	}

	mDirty = true;
}

// Stores the given view so that it can be walked code point by code point via begin()/end().
Utf8IterableString::Utf8IterableString(std::string_view str)
	: mStr(str) {}

// Iterator positioned on the first byte of the stored string.
Utf8Iterator Utf8IterableString::begin() const {
	Utf8Iterator first{ mStr.begin() };
	return first;
}

// Iterator positioned one past the last byte of the stored string.
Utf8Iterator Utf8IterableString::end() const {
	Utf8Iterator pastTheEnd{ mStr.end() };
	return pastTheEnd;
}

/*
TEST_CASE("Iterating ASCII string") {
	std::string ascii("This is an ASCII string");
	std::u32string output;
	output.reserve(ascii.length());

	for (char32_t c : Utf8IterableString(ascii)) {
		output += c;
	}

	CHECK(output == U"This is an ASCII string");
}

// BMP: Basic Multilingual Plane
TEST_CASE("Iterating BMP string") {
	std::string unicode("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
	std::u32string output;
	output.reserve(10);

	for (char32_t c : Utf8IterableString(unicode)) {
		output += c;
	}

	CHECK(output == U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
*/

std::u32string Utils::ConvertUtf8To32(std::string_view in) {
	std::u32string str;
	// Actual size cannot be smaller than this
	str.reserve(in.size());
	for (char32_t codepoint : Utf8IterableString(in)) {
		str += codepoint;
	}
	return str;
}

// Encodes a sequence of UTF-32 code points as UTF-8.
//
// Each code point produces one to four bytes depending on its magnitude. Values above 0x10FFFF
// produce no output at all; no other validation is performed.
std::string Utils::ConvertUtf32To8(std::u32string_view in) {
	std::string utf8;
	// Appends the low eight bits of `bits` as a single byte.
	const auto emit = [&utf8](char32_t bits) {
		utf8.push_back(static_cast<char>(bits));
	};

	for (const char32_t cp : in) {
		if (cp <= 0x7F) {
			emit(cp); // 0xxxxxxx
			continue;
		}
		if (cp <= 0x7FF) {
			emit(0xC0 | (cp >> 6)); // 110xxxxx
			emit(0x80 | (cp & 0x3F)); // 10xxxxxx
			continue;
		}
		if (cp <= 0xFFFF) {
			emit(0xE0 | (cp >> 12)); // 1110xxxx
			emit(0x80 | ((cp >> 6) & 0x3F)); // 10xxxxxx
			emit(0x80 | (cp & 0x3F)); // 10xxxxxx
			continue;
		}
		if (cp <= 0x10FFFF) {
			emit(0xF0 | (cp >> 18)); // 11110xxx
			emit(0x80 | ((cp >> 12) & 0x3F)); // 10xxxxxx
			emit(0x80 | ((cp >> 6) & 0x3F)); // 10xxxxxx
			emit(0x80 | (cp & 0x3F)); // 10xxxxxx
		}
	}
	return utf8;
}

/*
TEST_CASE("Utils::ConvertUtf32To8() with ASCII") {
	auto output = Utils::ConvertUtf32To8(U"This is an ASCII string");
	CHECK(output == "This is an ASCII string");
}

TEST_CASE("Utils::ConvertUtf32To8() with BMP codepoints") {
	auto output = Utils::ConvertUtf32To8(U"Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
	CHECK(output == "Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
}
 */

std::string_view Utils::SliceUtf8(std::string_view str, size_t begin, size_t end) {
	const char* resBegin;
	size_t resLength = 0;

	Utf8Iterator it{ str.begin() };
	size_t i = 0; // Nth codepoint on the string

	// Skip until `it` points to the `begin`-th codepoint in the string
	while (i < begin) {
		i++;
		it++;
	} // Postcondition: i == begin
	resBegin = &*it.AsInternal();

	while (i < end) {
		auto prev = it;
		i++;
		it++;

		resLength += std::distance(prev.AsInternal(), it.AsInternal());
	} // Postcondition: i == end

	return { resBegin, resLength };
}

/*
TEST_CASE("Utils::CreateRange() with ASCII") {
	auto a = Utils::CreateRange("This is an ASCII string", 1, 1 + 5);
	std::string range(a);
	CHECK(range == "his i");
}

TEST_CASE("Utils::CreateRange() with BMP codepoints") {
	std::string range(Utils::SliceUtf8("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 11, 11 + 5));
	CHECK(range == "t \u8FD9\u662F\u4E00");
}
*/

size_t Utils::CountUtf8Codepoints(std::string_view str) {
	size_t result = 0;
	for (char32_t _ : Utf8IterableString(str)) {
		result++;
	}
	return result;
}

/*
TEST_CASE("Utils::GetLength() test") {
	CHECK(Utils::GetLength("This is an ASCII string") == 23);
	CHECK(Utils::CountUtf8Codepoints("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32") == 23);
}

Utils::CodepointInfo Utils::FindLastCodepoint(std::string_view str) {
	Utf8Iterator it{ str.begin() };
	Utf8Iterator prev{ it };
	size_t codepoints = 0;

	Utf8Iterator end{ str.end() };
	while (it != end) {
		codepoints++;

		prev = it;
		it++;
	}
	// it == end
	// prev == <last codepoint in str>

	return {
		codepoints - 1,
		(size_t)std::distance(str.begin(), prev.AsInternal()),
	};
}

TEST_CASE("Utils::FindLastCodepoint() ASCII test") {
	auto [index, byteOffset] = Utils::FindLastCodepoint("This is an ASCII string");
	CHECK(index == 22);
	CHECK(byteOffset == 22);
}

TEST_CASE("Utils::FindLastCodepoint() BMP test") {
	auto [index, byteOffset] = Utils::FindLastCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32");
	CHECK(index == 22);
	CHECK(byteOffset == 40);
}

Utils::CodepointInfo Utils::FindCodepoint(std::string_view str, size_t codepointIdx) {
	Utf8Iterator it{ str.begin() };
	Utf8Iterator prev{ it };
	size_t codepoint = 0;

	Utf8Iterator end{ str.end() };
	while (true) {
		if (codepoint == codepointIdx) {
			return { codepoint, (size_t)std::distance(str.begin(), it.AsInternal()) };
		}
		if (it == end) {
			return { codepoint - 1, (size_t)std::distance(str.begin(), prev.AsInternal()) };
		}

		codepoint++;

		prev = it;
		it++;
	}
}

TEST_CASE("Utils::FindCodepoint() ASCII test") {
	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 6);
	CHECK(codepointOffset == 6);
	CHECK(byteOffset == 6);
}

TEST_CASE("Utils::FindCodepoint() ASCII past-the-end test") {
	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("This is an ASCII string", 100);
	CHECK(codepointOffset == 22);
	CHECK(byteOffset == 22);
}

TEST_CASE("Utils::FindCodepoint() BMP test") {
	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 14);
	CHECK(codepointOffset == 14);
	CHECK(byteOffset == 16);
}

TEST_CASE("Utils::FindCodepoint() BMP past-the-end test") {
	auto [codepointOffset, byteOffset] = Utils::FindCodepoint("Unicode test \u8FD9\u662F\u4E00\u4E2A\u6D4B\u8BD5\u7528\u5B57\u7B26\u4E32", 100);
	CHECK(codepointOffset == 22);
	CHECK(byteOffset == 40);
}
*/