1 files changed, 183 insertions, 0 deletions
diff --git a/source/20-codegen-compiler/CodegenLexer.cpp b/source/20-codegen-compiler/CodegenLexer.cpp
new file mode 100644
index 0000000..dab6aea
--- /dev/null
+++ b/source/20-codegen-compiler/CodegenLexer.cpp
@@ -0,0 +1,183 @@
+#include "CodegenLexer.hpp"
+
+#include <cassert>
+
+/// Returns true when `lexerToken` lies in [0, 256), i.e. the range stb_c_lexer uses to report
+/// a single character it consumed verbatim.
+bool StbTokenIsSingleChar(int lexerToken) {
+	constexpr int kSingleCharTokenLimit = 256;
+	if (lexerToken < 0) {
+		return false;
+	}
+	return lexerToken < kSingleCharTokenLimit;
+}
+
+/// Returns true when `lexerToken` is anything other than a single-character token,
+/// i.e. the exact complement of StbTokenIsSingleChar().
+bool StbTokenIsMultiChar(int lexerToken) {
+	// Must delegate to StbTokenIsSingleChar(): negating a call to this very function would
+	// recurse without end and never produce a result.
+	return !StbTokenIsSingleChar(lexerToken);
+}
+
+/// Concatenates the `text` of every token in `tokens`, in order and without any separator.
+std::string CombineTokens(std::span<const StbLexerToken> tokens) {
+	// First pass: measure the total length so the output capacity can be reserved up front.
+	size_t totalChars = 0;
+	for (size_t i = 0; i < tokens.size(); ++i) {
+		totalChars += tokens[i].text.size();
+	}
+
+	std::string combined;
+	combined.reserve(totalChars);
+
+	// Second pass: append each token's text.
+	for (size_t i = 0; i < tokens.size(); ++i) {
+		combined.append(tokens[i].text);
+	}
+	return combined;
+}
+
+/// Returns the token at the current read position `idx`.
+/// The position is only checked by `assert`, so callers must make sure `idx` is in range.
+const StbLexerToken& CodegenLexer::Current() const {
+	const auto position = idx;
+	assert(position < tokens.size());
+	return tokens[position];
+}
+
+/// Tokenizes `source` with stb_c_lexer and stores the result in `tokens`, resetting `idx` to 0.
+///
+/// On top of the raw stb_c_lexer output this function:
+/// - wraps every single-character token as `CLEX_ext_single_char`, with that character as its text;
+/// - merges directly adjacent single-character tokens spelling "::" or "..." into one
+///   `CLEX_ext_double_colon` / `CLEX_ext_dot_dot_dot` token;
+/// - copies `int_number` / `real_number` into the token for integer / float literals.
+///
+/// Parse errors are reported with printf() and the offending token is skipped.
+void CodegenLexer::InitializeFrom(std::string_view source) {
+	this->tokens = {};
+	this->idx = 0;
+
+	stb_lexer lexer;
+	char stringStorage[65536];
+	const char* srcBegin = source.data();
+	const char* srcEnd = srcBegin + source.length();
+	stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));
+
+	struct TokenCombiningPattern {
+		StbLexerToken result;
+		char matchChars[16];
+	};
+
+	const TokenCombiningPattern kDoubleColon = {
+		.result = {
+			.text = "::",
+			.type = CLEX_ext_double_colon,
+		},
+		.matchChars = { ':', ':', '\0' },
+	};
+	const TokenCombiningPattern kDotDotDot = {
+		.result = {
+			.text = "...",
+			.type = CLEX_ext_dot_dot_dot,
+		},
+		.matchChars = { '.', '.', '.', '\0' },
+	};
+	// Every pattern that may be combined; each one must begin with a distinct character.
+	const TokenCombiningPattern* const kPatterns[] = { &kDoubleColon, &kDotDotDot };
+
+	// Pattern currently being matched (nullptr if none) and the number of its chars matched so far.
+	const TokenCombiningPattern* currentState = nullptr;
+	int currentStateCharIdx = 0;
+	// Source position of the most recent single-character token. Only read while `currentState`
+	// is set, in which case it always points at the last character matched so far.
+	const char* lastSingleCharPos = nullptr;
+
+	auto resetMatch = [&]() {
+		currentState = nullptr;
+		currentStateCharIdx = 0;
+	};
+	auto tryStartMatch = [&](char c) {
+		for (const TokenCombiningPattern* pattern : kPatterns) {
+			if (pattern->matchChars[0] == c) {
+				currentState = pattern;
+				currentStateCharIdx = 1;
+				return;
+			}
+		}
+	};
+
+	while (true) {
+		// See stb_c_lexer.h's comments, here are a few additions that aren't made clear in the file:
+		// - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
+		//     1. 0 <= token < 256:          an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
+		//     2. token < 0:                 an unknown token
+		//     3. One of the `CLEX_*` enums: a special, recognized token such as an operator
+
+		int stbToken = stb_c_lexer_get_token(&lexer);
+		if (stbToken == 0) {
+			// EOF
+			break;
+		}
+
+		if (lexer.token == CLEX_parse_error) {
+			printf("[ERROR] stb_c_lexer countered a parse error.\n");
+			// TODO how to handle?
+			// Whatever failed to parse sits between the previous and the next token, so a
+			// partially matched pattern cannot continue across it.
+			resetMatch();
+			continue;
+		}
+
+		StbLexerToken token;
+		if (StbTokenIsSingleChar(lexer.token)) {
+			const char c = static_cast<char>(lexer.token);
+
+			token.type = CLEX_ext_single_char;
+			token.text = std::string(1, c);
+
+			// A pattern only continues when this character is the expected one AND it sits
+			// immediately after the previously matched character in the source. The adjacency
+			// requirement keeps e.g. `: :` (or chars separated by a comment) from becoming `::`.
+			const bool continuesMatch =
+				currentState != nullptr &&
+				lexer.where_firstchar == lastSingleCharPos + 1 &&
+				currentState->matchChars[currentStateCharIdx] == c;
+
+			if (continuesMatch) {
+				++currentStateCharIdx;
+
+				// If we matched all of the chars...
+				if (currentState->matchChars[currentStateCharIdx] == '\0') {
+					// We matched (currentStateCharIdx) chars, of which this one has not been pushed yet,
+					// leaving (currentStateCharIdx - 1) tokens to be removed. Those are guaranteed to be
+					// the matched single-char tokens because any other token resets the match.
+					for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
+						tokens.pop_back();
+					}
+
+					// Set the current token to desired result
+					token = currentState->result;
+
+					resetMatch();
+				}
+			} else {
+				// Either nothing was in progress or the match just failed; in both cases this
+				// character may itself be the first character of a pattern (e.g. the `:` in `.::`).
+				resetMatch();
+				tryStartMatch(c);
+			}
+
+			lastSingleCharPos = lexer.where_lastchar;
+		} else {
+			// Any other token interrupts a partially matched pattern. Without this reset a later
+			// matching character would complete the pattern and pop unrelated tokens.
+			resetMatch();
+
+			token.type = lexer.token;
+			// WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
+			// NOTE(review): for token kinds where stb_c_lexer does not write `lexer.string`, this may
+			// copy leftover text from an earlier token -- confirm against stb_c_lexer.h.
+			token.text = std::string(lexer.string);
+
+			switch (token.type) {
+				case CLEX_intlit:
+					token.lexerIntNumber = lexer.int_number;
+					break;
+
+				case CLEX_floatlit:
+					token.lexerRealNumber = lexer.real_number;
+					break;
+
+				default:
+					break;
+			}
+		}
+		tokens.push_back(std::move(token));
+	}
+}
+
+/// If the current token has the given `type`, consumes it (advances `idx` by one) and returns a
+/// pointer to it. Returns nullptr and consumes nothing when the type differs or when every
+/// token has already been consumed.
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+	// Nothing left to look at; indexing `tokens` here would read out of range.
+	if (idx >= tokens.size()) {
+		return nullptr;
+	}
+
+	auto& token = tokens[idx];
+	if (token.type == type) {
+		++idx;
+		return &token;
+	}
+	return nullptr;
+}
+
+/// If the current token is a `CLEX_ext_single_char` token whose character equals `c`, consumes it
+/// (advances `idx` by one) and returns a pointer to it. Returns nullptr and consumes nothing
+/// otherwise, including when every token has already been consumed.
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+	// Nothing left to look at; indexing `tokens` here would read out of range.
+	if (idx >= tokens.size()) {
+		return nullptr;
+	}
+
+	auto& token = tokens[idx];
+	if (token.type == CLEX_ext_single_char &&
+		token.text[0] == c)
+	{
+		++idx;
+		return &token;
+	}
+	return nullptr;
+}
+
+/// Advances `idx` until the current token has the given `type` or the end of `tokens` is reached.
+/// A matching token is left as the current token; it is not consumed.
+void CodegenLexer::SkipUntilToken(int type) {
+	for (; idx < tokens.size(); ++idx) {
+		if (Current().type == type) {
+			return;
+		}
+	}
+}
+
+/// Advances `idx` until the current token is a `CLEX_ext_single_char` token whose character
+/// equals `c`, or the end of `tokens` is reached. A matching token is not consumed.
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+	for (; idx < tokens.size(); ++idx) {
+		const StbLexerToken& candidate = Current();
+		const bool isWantedChar = candidate.type == CLEX_ext_single_char && candidate.text[0] == c;
+		if (isWantedChar) {
+			return;
+		}
+	}
+}