aboutsummaryrefslogtreecommitdiff
path: root/src/brussel.codegen.comp/CodegenLexer.cpp
diff options
context:
space:
mode:
authorrtk0c <[email protected]>2023-10-19 22:50:07 -0700
committerrtk0c <[email protected]>2025-08-16 11:31:16 -0700
commit297232d21594b138bb368a42b5b0d085ff9ed6aa (patch)
tree075d5407e1e12a9d35cbee6e4c20ad34e0765c42 /src/brussel.codegen.comp/CodegenLexer.cpp
parentd5cd34ff69f7fd134d5450696f298af1a864afbc (diff)
The great renaming: switch to "module style"
Diffstat (limited to 'src/brussel.codegen.comp/CodegenLexer.cpp')
-rw-r--r--src/brussel.codegen.comp/CodegenLexer.cpp202
1 files changed, 202 insertions, 0 deletions
diff --git a/src/brussel.codegen.comp/CodegenLexer.cpp b/src/brussel.codegen.comp/CodegenLexer.cpp
new file mode 100644
index 0000000..ecb2186
--- /dev/null
+++ b/src/brussel.codegen.comp/CodegenLexer.cpp
@@ -0,0 +1,202 @@
+#include "CodegenLexer.hpp"
+
+#include <cassert>
+
+int StbLexerToken::Reamalgamate() const {
+ if (type == CLEX_ext_single_char) {
+ return text[0];
+ } else {
+ return type;
+ }
+}
+
// True when `lexerToken` is a plain character token: stb_c_lexer reports
// single characters it ate as token values in the range [0, 256).
bool StbTokenIsSingleChar(int lexerToken) {
    return 0 <= lexerToken && lexerToken <= 255;
}
+
// True when `lexerToken` is NOT a plain single-character token, i.e. it is
// one of the CLEX_* multi-char tokens (or an unknown negative value).
//
// BUG FIX: the original body was `return !StbTokenIsMultiChar(lexerToken);`
// — it negated *itself*, recursing unconditionally until stack overflow.
// The intended predicate is the negation of the single-char range check.
bool StbTokenIsMultiChar(int lexerToken) {
    // Single-char tokens occupy [0, 256); everything else is multi-char.
    return !(lexerToken >= 0 && lexerToken < 256);
}
+
+std::string CombineTokens(std::span<const StbLexerToken> tokens, std::string_view separator) {
+ if (tokens.empty()) {
+ return {};
+ }
+
+ size_t length = 0;
+ for (auto& token : tokens) {
+ length += token.text.size();
+ length += separator.size();
+ }
+ // Intentionally counting an extra separator: leave space for the last append below
+
+ std::string result;
+ result.reserve(length);
+ for (auto& token : tokens) {
+ result += token.text;
+ result += separator;
+ }
+ // Remove the trailing separator
+ result.resize(result.size() - separator.size());
+
+ return result;
+}
+
// Returns the token under the cursor without consuming it.
// Precondition (debug-asserted): the cursor has not run past the end of
// the token stream, i.e. idx < tokens.size().
const StbLexerToken& CodegenLexer::Current() const {
    assert(idx < tokens.size());
    return tokens[idx];
}
+
// Lexes `source` with stb_c_lexer into this->tokens and resets the cursor.
// Single-char tokens are additionally fed through a small state machine that
// fuses the runs "::" and "..." into the synthetic CLEX_ext_double_colon /
// CLEX_ext_dot_dot_dot tokens.
void CodegenLexer::InitializeFrom(std::string_view source) {
    // Discard any previously lexed stream before starting over.
    this->tokens = {};
    this->idx = 0;

    stb_lexer lexer;
    // Scratch space stb_c_lexer uses to hold the text of the token currently
    // being parsed; each token's text is copied into a std::string below, so
    // this buffer does not need to outlive the function.
    char stringStorage[65536];
    const char* srcBegin = source.data();
    const char* srcEnd = srcBegin + source.length();
    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));

    // Describes one run of single-char tokens to be fused into one token.
    struct TokenCombiningPattern {
        StbLexerToken result; // Token emitted once the whole run has matched
        char matchChars[16];  // NUL-terminated sequence of chars to match
    };

    const TokenCombiningPattern kDoubleColon = {
        .result = {
            .text = "::",
            .type = CLEX_ext_double_colon,
        },
        .matchChars = { ':', ':', '\0' },
    };
    const TokenCombiningPattern kDotDotDot = {
        .result = {
            .text = "...",
            .type = CLEX_ext_dot_dot_dot,
        },
        .matchChars = { '.', '.', '.', '\0' },
    };

    // Pattern currently being matched (nullptr when idle) and the index of
    // the next char expected in currentState->matchChars.
    const TokenCombiningPattern* currentState = nullptr;
    int currentStateCharIdx = 0;

    while (true) {
        // See stb_c_lexer.h's comments, here are a few additions that aren't made clear in the file:
        // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
        //   1. 0 <= token < 256: an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
        //   2. token < 0: an unknown token
        //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator

        int stbToken = stb_c_lexer_get_token(&lexer);
        if (stbToken == 0) {
            // EOF
            break;
        }

        if (lexer.token == CLEX_parse_error) {
            printf("[ERROR] stb_c_lexer countered a parse error.\n");
            // TODO how to handle?
            continue;
        }

        StbLexerToken token;
        if (StbTokenIsSingleChar(lexer.token)) {
            char c = lexer.token;

            token.type = CLEX_ext_single_char;
            token.text = std::string(1, c);

            if (!currentState) {
                // Idle: check whether this char begins one of the patterns.
                // NOTE(review): patterns start with distinct chars (':' vs
                // '.'), so at most one TRY_START_MATCH fires per char.
#define TRY_START_MATCH(states) \
    if (states.matchChars[0] == c) { \
        currentState = &states; \
        currentStateCharIdx = 1; \
    }
                TRY_START_MATCH(kDoubleColon);
                TRY_START_MATCH(kDotDotDot);
#undef TRY_START_MATCH
            } else {
                if (currentState->matchChars[currentStateCharIdx] == c) {
                    // Match success
                    ++currentStateCharIdx;

                    // If we matched all of the chars...
                    if (currentState->matchChars[currentStateCharIdx] == '\0') {
                        // We matched (currentStateCharIdx) tokens though this one is pushed into the vector, leaving (currentStateCharIdx - 1) tokens to be removed
                        for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
                            tokens.pop_back();
                        }

                        // Set the current token to desired result
                        token = currentState->result;

                        currentState = nullptr;
                        currentStateCharIdx = 0;
                    }
                } else {
                    // Match fail, reset
                    // NOTE(review): the mismatching char is NOT re-tested as
                    // the start of a new pattern, so e.g. ":..." lexes as four
                    // single chars rather than ':' + "..." — confirm intended.
                    currentState = nullptr;
                    currentStateCharIdx = 0;
                }
            }
        } else {
            token.type = lexer.token;
            // WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
            token.text = std::string(lexer.string);

            // Numeric literals carry their parsed value alongside the text.
            switch (token.type) {
                case CLEX_intlit:
                    token.lexerIntNumber = lexer.int_number;
                    break;

                case CLEX_floatlit:
                    token.lexerRealNumber = lexer.real_number;
                    break;
            }
        }
        tokens.push_back(std::move(token));
        // Defensive reset of the moved-from token; it goes out of scope on
        // the next iteration anyway.
        token = {};
    }
}
+
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+ auto& token = tokens[idx];
+ if (token.type == type) {
+ ++idx;
+ return &token;
+ }
+ return nullptr;
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+ auto& token = tokens[idx];
+ if (token.type == CLEX_ext_single_char &&
+ token.text[0] == c)
+ {
+ ++idx;
+ return &token;
+ }
+ return nullptr;
+}
+
+void CodegenLexer::SkipUntilToken(int type) {
+ while (idx < tokens.size()) {
+ if (Current().type == type) {
+ break;
+ }
+ ++idx;
+ }
+}
+
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+ while (idx < tokens.size()) {
+ auto& curr = Current();
+ if (curr.type == CLEX_ext_single_char &&
+ curr.text[0] == c)
+ {
+ break;
+ }
+ ++idx;
+ }
+}