#include "CodegenLexer.hpp"

#include <cassert>
#include <cstdio>

/// Collapses the extended single-char representation back into one integer:
/// for a `CLEX_ext_single_char` token the character itself is returned,
/// otherwise the token type is returned unchanged.
int StbLexerToken::Reamalgamate() const {
    return type == CLEX_ext_single_char ? text[0] : type;
}

/// Returns true when `lexerToken` lies in [0, 256), i.e. the range stb_c_lexer
/// uses for tokens that are a single character.
bool StbTokenIsSingleChar(int lexerToken) {
    return lexerToken >= 0 && lexerToken < 256;
}

/// Returns true when `lexerToken` is anything other than a single-char token.
bool StbTokenIsMultiChar(int lexerToken) {
    // This used to call itself, which recursed forever.
    return !StbTokenIsSingleChar(lexerToken);
}

/// Concatenates the text of every token in `tokens`, inserting `separator`
/// between consecutive tokens (but not after the last one).
/// Returns an empty string when `tokens` is empty.
std::string CombineTokens(std::span<const StbLexerToken> tokens, std::string_view separator) {
    if (tokens.empty()) {
        return {};
    }

    // Exact final size: all token texts plus one separator between each pair.
    size_t length = separator.size() * (tokens.size() - 1);
    for (const auto& token : tokens) {
        length += token.text.size();
    }

    std::string result;
    result.reserve(length);

    bool isFirst = true;
    for (const auto& token : tokens) {
        if (!isFirst) {
            result += separator;
        }
        isFirst = false;
        result += token.text;
    }
    return result;
}

/// Returns the token at the cursor. The cursor must not be past the end.
const StbLexerToken& CodegenLexer::Current() const {
    assert(idx < tokens.size());
    return tokens[idx];
}

/// Discards any previous state, lexes `source` with stb_c_lexer and stores the
/// resulting tokens, leaving the cursor at the first token.
///
/// On top of what stb_c_lexer produces, runs of directly adjacent single-char
/// tokens spelling `::` or `...` are merged into one `CLEX_ext_double_colon` /
/// `CLEX_ext_dot_dot_dot` token. Every other single character is stored as a
/// `CLEX_ext_single_char` token whose text is that character.
void CodegenLexer::InitializeFrom(std::string_view source) {
    this->tokens = {};
    this->idx = 0;

    stb_lexer lexer;
    char stringStorage[65536];
    const char* srcBegin = source.data();
    const char* srcEnd = srcBegin + source.length();
    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));

    struct TokenCombiningPattern {
        StbLexerToken result;
        char matchChars[16];
    };

    const TokenCombiningPattern kDoubleColon = {
        .result = {
            .text = "::",
            .type = CLEX_ext_double_colon,
        },
        .matchChars = { ':', ':', '\0' },
    };
    const TokenCombiningPattern kDotDotDot = {
        .result = {
            .text = "...",
            .type = CLEX_ext_dot_dot_dot,
        },
        .matchChars = { '.', '.', '.', '\0' },
    };
    const TokenCombiningPattern* const kPatterns[] = { &kDoubleColon, &kDotDotDot };

    // Pattern currently being matched (nullptr if none) and how many of its chars were matched so far
    const TokenCombiningPattern* currentState = nullptr;
    int currentStateCharIdx = 0;
    // Last source character of the previously stored token; used to reject matches across whitespace/comments
    const char* prevTokenEnd = nullptr;

    auto resetMatch = [&]() {
        currentState = nullptr;
        currentStateCharIdx = 0;
    };
    auto tryStartMatch = [&](char c) {
        resetMatch();
        for (const TokenCombiningPattern* pattern : kPatterns) {
            if (pattern->matchChars[0] == c) {
                currentState = pattern;
                currentStateCharIdx = 1;
                break;
            }
        }
    };

    // See stb_c_lexer.h's comments, here are a few additions that aren't made clear in the file:
    // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
    //   1. 0 <= token < 256: an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
    //   2. token < 0: an unknown token
    //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator
    // - stb_c_lexer_get_token() returns 0 on EOF
    while (stb_c_lexer_get_token(&lexer) != 0) {
        if (lexer.token == CLEX_parse_error) {
            printf("[ERROR] stb_c_lexer countered a parse error.\n");
            // TODO how to handle?
            // Whatever was skipped separates the tokens around it, so a pending match must not span it
            resetMatch();
            prevTokenEnd = nullptr;
            continue;
        }

        // True when this token begins right after the previous one, with nothing in between
        const bool adjacentToPrev = prevTokenEnd != nullptr && lexer.where_firstchar == prevTokenEnd + 1;
        prevTokenEnd = lexer.where_lastchar;

        StbLexerToken token;
        if (StbTokenIsSingleChar(static_cast<int>(lexer.token))) {
            const char c = static_cast<char>(lexer.token);

            token.type = CLEX_ext_single_char;
            token.text = std::string(1, c);

            if (currentState && adjacentToPrev && currentState->matchChars[currentStateCharIdx] == c) {
                // Match success
                ++currentStateCharIdx;

                // If we matched all of the chars...
                if (currentState->matchChars[currentStateCharIdx] == '\0') {
                    // We matched (currentStateCharIdx) tokens; this one has not been pushed yet,
                    // leaving (currentStateCharIdx - 1) already stored tokens to be removed
                    for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
                        tokens.pop_back();
                    }

                    // Set the current token to desired result
                    token = currentState->result;
                    resetMatch();
                }
            } else {
                // No match in progress, or it failed: this char may itself begin a pattern
                tryStartMatch(c);
            }
        } else {
            // A multi-char token can never be part of `::` or `...`
            resetMatch();

            token.type = static_cast<int>(lexer.token);

            // Only identifiers and string literals carry a payload in lexer.string; for every other
            // token that pointer is stale (or never set), so take the text from the source instead.
            const bool hasStringPayload =
                token.type == CLEX_id || token.type == CLEX_dqstring || token.type == CLEX_sqstring;
            if (hasStringPayload) {
                // WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
                token.text = std::string(lexer.string);
            } else {
                token.text = std::string(lexer.where_firstchar, lexer.where_lastchar + 1);
            }

            switch (token.type) {
                case CLEX_intlit:
                    token.lexerIntNumber = lexer.int_number;
                    break;
                case CLEX_floatlit:
                    token.lexerRealNumber = lexer.real_number;
                    break;
                default:
                    break;
            }
        }

        tokens.push_back(std::move(token));
    }
}

/// If the token at the cursor has the given `type`, advances the cursor and
/// returns that token; otherwise (including when the cursor is at the end)
/// returns nullptr and leaves the cursor untouched.
const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
    if (idx >= tokens.size()) {
        return nullptr;
    }

    const StbLexerToken& token = tokens[idx];
    if (token.type != type) {
        return nullptr;
    }

    ++idx;
    return &token;
}

/// If the token at the cursor is the single-char token `c`, advances the cursor
/// and returns that token; otherwise (including when the cursor is at the end)
/// returns nullptr and leaves the cursor untouched.
const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
    if (idx >= tokens.size()) {
        return nullptr;
    }

    const StbLexerToken& token = tokens[idx];
    if (token.type != CLEX_ext_single_char || token.text[0] != c) {
        return nullptr;
    }

    ++idx;
    return &token;
}

/// Advances the cursor until it rests on a token of the given `type`, or on the
/// end of the token list if there is none. The matching token is not consumed.
void CodegenLexer::SkipUntilToken(int type) {
    while (idx < tokens.size() && tokens[idx].type != type) {
        ++idx;
    }
}

/// Advances the cursor until it rests on the single-char token `c`, or on the
/// end of the token list if there is none. The matching token is not consumed.
void CodegenLexer::SkipUntilTokenSingleChar(char c) {
    for (; idx < tokens.size(); ++idx) {
        const StbLexerToken& curr = tokens[idx];
        if (curr.type == CLEX_ext_single_char && curr.text[0] == c) {
            return;
        }
    }
}