From 297232d21594b138bb368a42b5b0d085ff9ed6aa Mon Sep 17 00:00:00 2001
From: rtk0c
Date: Thu, 19 Oct 2023 22:50:07 -0700
Subject: The great renaming: switch to "module style"

---
 src/brussel.codegen.comp/CodegenLexer.cpp | 202 ++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 src/brussel.codegen.comp/CodegenLexer.cpp

diff --git a/src/brussel.codegen.comp/CodegenLexer.cpp b/src/brussel.codegen.comp/CodegenLexer.cpp
new file mode 100644
index 0000000..ecb2186
--- /dev/null
+++ b/src/brussel.codegen.comp/CodegenLexer.cpp
@@ -0,0 +1,202 @@
+#include "CodegenLexer.hpp"
+
+#include <stb_c_lexer.h>
+
+int StbLexerToken::Reamalgamate() const {
+    if (type == CLEX_ext_single_char) {
+        return text[0];
+    } else {
+        return type;
+    }
+}
+
+bool StbTokenIsSingleChar(int lexerToken) {
+    return lexerToken >= 0 && lexerToken < 256;
+}
+
+bool StbTokenIsMultiChar(int lexerToken) {
+    return !StbTokenIsSingleChar(lexerToken);
+}
+
+std::string CombineTokens(std::span<const StbLexerToken> tokens, std::string_view separator) {
+    if (tokens.empty()) {
+        return {};
+    }
+
+    size_t length = 0;
+    for (auto& token : tokens) {
+        length += token.text.size();
+        length += separator.size();
+    }
+    // Intentionally counting an extra separator: leave space for the last append below
+
+    std::string result;
+    result.reserve(length);
+    for (auto& token : tokens) {
+        result += token.text;
+        result += separator;
+    }
+    // Remove the trailing separator
+    result.resize(result.size() - separator.size());
+
+    return result;
+}
+
+const StbLexerToken& CodegenLexer::Current() const {
+    assert(idx < tokens.size());
+    return tokens[idx];
+}
+
+void CodegenLexer::InitializeFrom(std::string_view source) {
+    this->tokens = {};
+    this->idx = 0;
+
+    stb_lexer lexer;
+    char stringStorage[65536];
+    const char* srcBegin = source.data();
+    const char* srcEnd = srcBegin + source.length();
+    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));
+
+    struct TokenCombiningPattern {
+        StbLexerToken result;
+        char matchChars[16];
+    };
+
+    const TokenCombiningPattern kDoubleColon = {
+        .result = {
+            .text = "::",
+            .type = CLEX_ext_double_colon,
+        },
+        .matchChars = { ':', ':', '\0' },
+    };
+    const TokenCombiningPattern kDotDotDot = {
+        .result = {
+            .text = "...",
+            .type = CLEX_ext_dot_dot_dot,
+        },
+        .matchChars = { '.', '.', '.', '\0' },
+    };
+
+    const TokenCombiningPattern* currentState = nullptr;
+    int currentStateCharIdx = 0;
+
+    while (true) {
+        // See stb_c_lexer.h's comments; here are a few additions that aren't made clear in that file:
+        // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
+        //   1. 0 <= token < 256: an ASCII character (more precisely, a single char that the lexer ate; technically it can be an incomplete code unit)
+        //   2. token < 0: an unknown token
+        //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator
+
+        int stbToken = stb_c_lexer_get_token(&lexer);
+        if (stbToken == 0) {
+            // EOF
+            break;
+        }
+
+        if (lexer.token == CLEX_parse_error) {
+            printf("[ERROR] stb_c_lexer encountered a parse error.\n");
+            // TODO how to handle?
+            continue;
+        }
+
+        StbLexerToken token;
+        if (StbTokenIsSingleChar(lexer.token)) {
+            char c = lexer.token;
+
+            token.type = CLEX_ext_single_char;
+            token.text = std::string(1, c);
+
+            if (!currentState) {
+#define TRY_START_MATCH(states) \
+    if (states.matchChars[0] == c) { \
+        currentState = &states; \
+        currentStateCharIdx = 1; \
+    }
+                TRY_START_MATCH(kDoubleColon);
+                TRY_START_MATCH(kDotDotDot);
+#undef TRY_START_MATCH
+            } else {
+                if (currentState->matchChars[currentStateCharIdx] == c) {
+                    // Match success
+                    ++currentStateCharIdx;
+
+                    // If we matched all of the chars...
+                    if (currentState->matchChars[currentStateCharIdx] == '\0') {
+                        // We matched currentStateCharIdx chars in total; the current one becomes the combined
+                        // token pushed below, so remove the (currentStateCharIdx - 1) partial tokens already in the vector
+                        for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
+                            tokens.pop_back();
+                        }
+
+                        // Set the current token to the desired combined result
+                        token = currentState->result;
+
+                        currentState = nullptr;
+                        currentStateCharIdx = 0;
+                    }
+                } else {
+                    // Match failed, reset
+                    currentState = nullptr;
+                    currentStateCharIdx = 0;
+                }
+            }
+        } else {
+            token.type = lexer.token;
+            // WORKAROUND: use the null-terminated string; stb_c_lexer doesn't set string_len properly when parsing identifiers
+            token.text = std::string(lexer.string);
+
+            switch (token.type) {
+            case CLEX_intlit:
+                token.lexerIntNumber = lexer.int_number;
+                break;
+
+            case CLEX_floatlit:
+                token.lexerRealNumber = lexer.real_number;
+                break;
+            }
+        }
+        tokens.push_back(std::move(token));
+        token = {};
+    }
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+    auto& token = tokens[idx];
+    if (token.type == type) {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+    auto& token = tokens[idx];
+    if (token.type == CLEX_ext_single_char &&
+        token.text[0] == c)
+    {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+void CodegenLexer::SkipUntilToken(int type) {
+    while (idx < tokens.size()) {
+        if (Current().type == type) {
+            break;
+        }
+        ++idx;
+    }
+}
+
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+    while (idx < tokens.size()) {
+        auto& curr = Current();
+        if (curr.type == CLEX_ext_single_char &&
+            curr.text[0] == c)
+        {
+            break;
+        }
+        ++idx;
+    }
+}
-- 
cgit v1.2.3-70-g09d2
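
For reference, below is a minimal driver sketch showing how the interface implemented in this file might be used. It is not part of the commit: it assumes the declarations in CodegenLexer.hpp match this translation unit (a default-constructible CodegenLexer, the member functions defined above, and the CLEX_ext_* extension constants), and the input string "foo::bar(42)" is made up purely for illustration; CLEX_id and CLEX_intlit come from stb_c_lexer.h.

// Illustrative sketch only, not part of the patch above.
#include "CodegenLexer.hpp"
#include <cstdio>

int main() {
    CodegenLexer lexer;
    // Tokenize a made-up snippet; "::" is combined into CLEX_ext_double_colon by InitializeFrom().
    lexer.InitializeFrom("foo::bar(42)");

    // Read a qualified name, piece by piece.
    while (const StbLexerToken* id = lexer.TryConsumeToken(CLEX_id)) {
        printf("identifier: %s\n", id->text.c_str());
        if (!lexer.TryConsumeToken(CLEX_ext_double_colon))
            break;
    }

    // Jump to the argument list and pull out the first integer literal.
    lexer.SkipUntilTokenSingleChar('(');
    lexer.TryConsumeSingleCharToken('(');
    if (const StbLexerToken* num = lexer.TryConsumeToken(CLEX_intlit))
        printf("first argument: %lld\n", (long long)num->lexerIntNumber);
    return 0;
}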