author    | rtk0c <[email protected]> | 2022-06-03 23:26:44 -0700
committer | rtk0c <[email protected]> | 2022-06-03 23:26:44 -0700
commit    | 60ccc62f4934e44ad5b905fdbcf458302b8d8a09 (patch)
tree      | 02ec83cc8387abfd08bd5ee7ea4e8115f1bfb8d0 /source/CodegenCompiler/CodegenLexer.cpp
parent    | c2ef7737536bf1f8c81fcfae95c0183b21c9753f (diff)
Changeset: 63 [WIP] Rename directories
Diffstat (limited to 'source/CodegenCompiler/CodegenLexer.cpp')
-rw-r--r-- | source/CodegenCompiler/CodegenLexer.cpp | 183
1 files changed, 183 insertions, 0 deletions
diff --git a/source/CodegenCompiler/CodegenLexer.cpp b/source/CodegenCompiler/CodegenLexer.cpp
new file mode 100644
index 0000000..dab6aea
--- /dev/null
+++ b/source/CodegenCompiler/CodegenLexer.cpp
@@ -0,0 +1,183 @@
+#include "CodegenLexer.hpp"
+
+#include <cassert>
+
+bool StbTokenIsSingleChar(int lexerToken) {
+    return lexerToken >= 0 && lexerToken < 256;
+}
+
+bool StbTokenIsMultiChar(int lexerToken) {
+    return !StbTokenIsSingleChar(lexerToken);
+}
+
+std::string CombineTokens(std::span<const StbLexerToken> tokens) {
+    size_t length = 0;
+    for (auto& token : tokens) {
+        length += token.text.size();
+    }
+    std::string result;
+    result.reserve(length);
+    for (auto& token : tokens) {
+        result += token.text;
+    }
+    return result;
+}
+
+const StbLexerToken& CodegenLexer::Current() const {
+    assert(idx < tokens.size());
+    return tokens[idx];
+}
+
+void CodegenLexer::InitializeFrom(std::string_view source) {
+    this->tokens = {};
+    this->idx = 0;
+
+    stb_lexer lexer;
+    char stringStorage[65536];
+    const char* srcBegin = source.data();
+    const char* srcEnd = srcBegin + source.length();
+    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));
+
+    struct TokenCombiningPattern {
+        StbLexerToken result;
+        char matchChars[16];
+    };
+
+    const TokenCombiningPattern kDoubleColon = {
+        .result = {
+            .text = "::",
+            .type = CLEX_ext_double_colon,
+        },
+        .matchChars = { ':', ':', '\0' },
+    };
+    const TokenCombiningPattern kDotDotDot = {
+        .result = {
+            .text = "...",
+            .type = CLEX_ext_dot_dot_dot,
+        },
+        .matchChars = { '.', '.', '.', '\0' },
+    };
+
+    const TokenCombiningPattern* currentState = nullptr;
+    int currentStateCharIdx = 0;
+
+    while (true) {
+        // See stb_c_lexer.h's comments; here are a few additions that aren't made clear in the file:
+        // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
+        //   1. 0 <= token < 256: an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
+        //   2. token < 0: an unknown token
+        //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator
+
+        int stbToken = stb_c_lexer_get_token(&lexer);
+        if (stbToken == 0) {
+            // EOF
+            break;
+        }
+
+        if (lexer.token == CLEX_parse_error) {
+            printf("[ERROR] stb_c_lexer encountered a parse error.\n");
+            // TODO how to handle?
+            continue;
+        }
+
+        StbLexerToken token;
+        if (StbTokenIsSingleChar(lexer.token)) {
+            char c = lexer.token;
+
+            token.type = CLEX_ext_single_char;
+            token.text = std::string(1, c);
+
+            if (!currentState) {
+#define TRY_START_MATCH(states) \
+    if (states.matchChars[0] == c) { \
+        currentState = &states; \
+        currentStateCharIdx = 1; \
+    }
+                TRY_START_MATCH(kDoubleColon);
+                TRY_START_MATCH(kDotDotDot);
+#undef TRY_START_MATCH
+            } else {
+                if (currentState->matchChars[currentStateCharIdx] == c) {
+                    // Match success
+                    ++currentStateCharIdx;
+
+                    // If we matched all of the chars...
+                    if (currentState->matchChars[currentStateCharIdx] == '\0') {
+                        // We matched (currentStateCharIdx) chars in total, but the current one hasn't been pushed yet, so remove the (currentStateCharIdx - 1) single-char tokens already in the vector
+                        for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
+                            tokens.pop_back();
+                        }
+
+                        // Set the current token to the desired combined result
+                        token = currentState->result;
+
+                        currentState = nullptr;
+                        currentStateCharIdx = 0;
+                    }
+                } else {
+                    // Match fail, reset
+
+                    currentState = nullptr;
+                    currentStateCharIdx = 0;
+                }
+            }
+        } else {
+            token.type = lexer.token;
+            // WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
+            token.text = std::string(lexer.string);
+
+            switch (token.type) {
+                case CLEX_intlit:
+                    token.lexerIntNumber = lexer.int_number;
+                    break;
+
+                case CLEX_floatlit:
+                    token.lexerRealNumber = lexer.real_number;
+                    break;
+            }
+        }
+        tokens.push_back(std::move(token));
+        token = {};
+    }
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+    auto& token = tokens[idx];
+    if (token.type == type) {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+    auto& token = tokens[idx];
+    if (token.type == CLEX_ext_single_char &&
+        token.text[0] == c)
+    {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+void CodegenLexer::SkipUntilToken(int type) {
+    while (idx < tokens.size()) {
+        if (Current().type == type) {
+            break;
+        }
+        ++idx;
+    }
+}
+
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+    while (idx < tokens.size()) {
+        auto& curr = Current();
+        if (curr.type == CLEX_ext_single_char &&
+            curr.text[0] == c)
+        {
+            break;
+        }
+        ++idx;
+    }
+}
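Note (not part of the commit): a minimal sketch of how the CodegenLexer added above might be driven. It assumes that CodegenLexer.hpp, which is not shown in this diff, declares StbLexerToken, the CLEX_ext_* extension token types, and a default-constructible CodegenLexer, and that CLEX_id comes from stb_c_lexer.h; the input string is made up for illustration.

// Illustrative usage sketch only; not part of the commit above.
#include "CodegenLexer.hpp"

#include <cstdio>

int main() {
    // Assumption: CodegenLexer can be default-constructed; InitializeFrom resets its state.
    CodegenLexer lexer;
    lexer.InitializeFrom("std::vector<int> values;");

    // "std" is an identifier token (CLEX_id from stb_c_lexer.h).
    if (const StbLexerToken* id = lexer.TryConsumeToken(CLEX_id))
        printf("identifier: %s\n", id->text.c_str());

    // The token-combining pass folded ':' ':' into a single "::" token.
    if (lexer.TryConsumeToken(CLEX_ext_double_colon))
        printf("consumed '::' as one combined token\n");

    if (const StbLexerToken* id = lexer.TryConsumeToken(CLEX_id))
        printf("identifier: %s\n", id->text.c_str());

    // Jump ahead to the terminating ';' and consume it.
    lexer.SkipUntilTokenSingleChar(';');
    lexer.TryConsumeSingleCharToken(';');
    return 0;
}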