aboutsummaryrefslogtreecommitdiff
path: root/src/brussel.codegen.comp/CodegenLexer.cpp
diff options
context:
space:
mode:
authorrtk0c <[email protected]>2023-10-19 22:50:07 -0700
committerrtk0c <[email protected]>2025-08-16 11:31:16 -0700
commit297232d21594b138bb368a42b5b0d085ff9ed6aa (patch)
tree075d5407e1e12a9d35cbee6e4c20ad34e0765c42 /src/brussel.codegen.comp/CodegenLexer.cpp
parentd5cd34ff69f7fd134d5450696f298af1a864afbc (diff)
The great renaming: switch to "module style"
Diffstat (limited to 'src/brussel.codegen.comp/CodegenLexer.cpp')
-rw-r--r--src/brussel.codegen.comp/CodegenLexer.cpp202
1 files changed, 202 insertions, 0 deletions
diff --git a/src/brussel.codegen.comp/CodegenLexer.cpp b/src/brussel.codegen.comp/CodegenLexer.cpp
new file mode 100644
index 0000000..ecb2186
--- /dev/null
+++ b/src/brussel.codegen.comp/CodegenLexer.cpp
@@ -0,0 +1,202 @@
+#include "CodegenLexer.hpp"
+
+#include <cassert>
+
+int StbLexerToken::Reamalgamate() const {
+ if (type == CLEX_ext_single_char) {
+ return text[0];
+ } else {
+ return type;
+ }
+}
+
// True when `lexerToken` is a plain character token: stb_c_lexer reports
// single characters it ate as token values in the range [0, 256).
bool StbTokenIsSingleChar(int lexerToken) {
    return 0 <= lexerToken && lexerToken <= 255;
}
+
// True when `lexerToken` is NOT a plain single-character token, i.e. it is
// one of the CLEX_* multi-char tokens (or an unknown negative value).
//
// BUG FIX: the original body was `return !StbTokenIsMultiChar(lexerToken);`
// — it negated *itself*, recursing unconditionally until stack overflow.
// The intended predicate is the negation of the single-char range check.
bool StbTokenIsMultiChar(int lexerToken) {
    // Single-char tokens occupy [0, 256); everything else is multi-char.
    return !(lexerToken >= 0 && lexerToken < 256);
}
+
+std::string CombineTokens(std::span<const StbLexerToken> tokens, std::string_view separator) {
+ if (tokens.empty()) {
+ return {};
+ }
+
+ size_t length = 0;
+ for (auto& token : tokens) {
+ length += token.text.size();
+ length += separator.size();
+ }
+ // Intentionally counting an extra separator: leave space for the last append below
+
+ std::string result;
+ result.reserve(length);
+ for (auto& token : tokens) {
+ result += token.text;
+ result += separator;
+ }
+ // Remove the trailing separator
+ result.resize(result.size() - separator.size());
+
+ return result;
+}
+
// Returns the token under the cursor without consuming it.
// Precondition (debug-asserted): the cursor has not run past the end of
// the token stream, i.e. idx < tokens.size().
const StbLexerToken& CodegenLexer::Current() const {
    assert(idx < tokens.size());
    return tokens[idx];
}
+
// Lexes `source` with stb_c_lexer into this->tokens and resets the cursor.
// Single-char tokens are additionally fed through a small state machine that
// fuses the runs "::" and "..." into the synthetic CLEX_ext_double_colon /
// CLEX_ext_dot_dot_dot tokens.
void CodegenLexer::InitializeFrom(std::string_view source) {
    // Discard any previously lexed stream before starting over.
    this->tokens = {};
    this->idx = 0;

    stb_lexer lexer;
    // Scratch space stb_c_lexer uses to hold the text of the token currently
    // being parsed; each token's text is copied into a std::string below, so
    // this buffer does not need to outlive the function.
    char stringStorage[65536];
    const char* srcBegin = source.data();
    const char* srcEnd = srcBegin + source.length();
    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));

    // Describes one run of single-char tokens to be fused into one token.
    struct TokenCombiningPattern {
        StbLexerToken result; // Token emitted once the whole run has matched
        char matchChars[16];  // NUL-terminated sequence of chars to match
    };

    const TokenCombiningPattern kDoubleColon = {
        .result = {
            .text = "::",
            .type = CLEX_ext_double_colon,
        },
        .matchChars = { ':', ':', '\0' },
    };
    const TokenCombiningPattern kDotDotDot = {
        .result = {
            .text = "...",
            .type = CLEX_ext_dot_dot_dot,
        },
        .matchChars = { '.', '.', '.', '\0' },
    };

    // Pattern currently being matched (nullptr when idle) and the index of
    // the next char expected in currentState->matchChars.
    const TokenCombiningPattern* currentState = nullptr;
    int currentStateCharIdx = 0;

    while (true) {
        // See stb_c_lexer.h's comments, here are a few additions that aren't made clear in the file:
        // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
        //   1. 0 <= token < 256: an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
        //   2. token < 0: an unknown token
        //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator

        int stbToken = stb_c_lexer_get_token(&lexer);
        if (stbToken == 0) {
            // EOF
            break;
        }

        if (lexer.token == CLEX_parse_error) {
            printf("[ERROR] stb_c_lexer countered a parse error.\n");
            // TODO how to handle?
            continue;
        }

        StbLexerToken token;
        if (StbTokenIsSingleChar(lexer.token)) {
            char c = lexer.token;

            token.type = CLEX_ext_single_char;
            token.text = std::string(1, c);

            if (!currentState) {
                // Idle: check whether this char begins one of the patterns.
                // NOTE(review): patterns start with distinct chars (':' vs
                // '.'), so at most one TRY_START_MATCH fires per char.
#define TRY_START_MATCH(states) \
    if (states.matchChars[0] == c) { \
        currentState = &states; \
        currentStateCharIdx = 1; \
    }
                TRY_START_MATCH(kDoubleColon);
                TRY_START_MATCH(kDotDotDot);
#undef TRY_START_MATCH
            } else {
                if (currentState->matchChars[currentStateCharIdx] == c) {
                    // Match success
                    ++currentStateCharIdx;

                    // If we matched all of the chars...
                    if (currentState->matchChars[currentStateCharIdx] == '\0') {
                        // We matched (currentStateCharIdx) tokens though this one is pushed into the vector, leaving (currentStateCharIdx - 1) tokens to be removed
                        for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
                            tokens.pop_back();
                        }

                        // Set the current token to desired result
                        token = currentState->result;

                        currentState = nullptr;
                        currentStateCharIdx = 0;
                    }
                } else {
                    // Match fail, reset
                    // NOTE(review): the mismatching char is NOT re-tested as
                    // the start of a new pattern, so e.g. ":..." lexes as four
                    // single chars rather than ':' + "..." — confirm intended.
                    currentState = nullptr;
                    currentStateCharIdx = 0;
                }
            }
        } else {
            token.type = lexer.token;
            // WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
            token.text = std::string(lexer.string);

            // Numeric literals carry their parsed value alongside the text.
            switch (token.type) {
                case CLEX_intlit:
                    token.lexerIntNumber = lexer.int_number;
                    break;

                case CLEX_floatlit:
                    token.lexerRealNumber = lexer.real_number;
                    break;
            }
        }
        tokens.push_back(std::move(token));
        // Defensive reset of the moved-from token; it goes out of scope on
        // the next iteration anyway.
        token = {};
    }
}
+
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+ auto& token = tokens[idx];
+ if (token.type == type) {
+ ++idx;
+ return &token;
+ }
+ return nullptr;
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+ auto& token = tokens[idx];
+ if (token.type == CLEX_ext_single_char &&
+ token.text[0] == c)
+ {
+ ++idx;
+ return &token;
+ }
+ return nullptr;
+}
+
+void CodegenLexer::SkipUntilToken(int type) {
+ while (idx < tokens.size()) {
+ if (Current().type == type) {
+ break;
+ }
+ ++idx;
+ }
+}
+
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+ while (idx < tokens.size()) {
+ auto& curr = Current();
+ if (curr.type == CLEX_ext_single_char &&
+ curr.text[0] == c)
+ {
+ break;
+ }
+ ++idx;
+ }
+}