aboutsummaryrefslogtreecommitdiff
path: root/source/20-codegen-compiler/CodegenLexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/20-codegen-compiler/CodegenLexer.cpp')
-rw-r--r--source/20-codegen-compiler/CodegenLexer.cpp183
1 files changed, 183 insertions, 0 deletions
diff --git a/source/20-codegen-compiler/CodegenLexer.cpp b/source/20-codegen-compiler/CodegenLexer.cpp
new file mode 100644
index 0000000..dab6aea
--- /dev/null
+++ b/source/20-codegen-compiler/CodegenLexer.cpp
@@ -0,0 +1,183 @@
+#include "CodegenLexer.hpp"
+
+#include <cassert>
+
+// Reports whether `lexerToken` lies in the range [0, 256), which is the range this file treats as
+// a token made of exactly one character.
+bool StbTokenIsSingleChar(int lexerToken) {
+    if (lexerToken < 0) {
+        return false;
+    }
+    return lexerToken < 256;
+}
+
+// Reports whether `lexerToken` is anything other than a single-character token, i.e. the exact
+// complement of StbTokenIsSingleChar().
+bool StbTokenIsMultiChar(int lexerToken) {
+    // Delegate to the single-char predicate; negating a call to this same function would recurse
+    // without ever terminating.
+    return !StbTokenIsSingleChar(lexerToken);
+}
+
+// Concatenates the `text` of every token in `tokens`, in order, into one string.
+std::string CombineTokens(std::span<const StbLexerToken> tokens) {
+    // First pass: measure the combined size so the output buffer is reserved exactly once
+    size_t totalChars = 0;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        totalChars += tokens[i].text.size();
+    }
+
+    // Second pass: append each piece
+    std::string combined;
+    combined.reserve(totalChars);
+    for (const StbLexerToken& piece : tokens) {
+        combined += piece.text;
+    }
+    return combined;
+}
+
+// Returns the token at the current read position without consuming it.
+// The position must be in range; this is only checked by an assert.
+const StbLexerToken& CodegenLexer::Current() const {
+    assert(idx < tokens.size());
+    const StbLexerToken& current = tokens[idx];
+    return current;
+}
+
+// Tokenizes `source` with stb_c_lexer and replaces this lexer's token list with the result,
+// rewinding the read position to the first token.
+//
+// Consecutive single-character tokens that spell "::" or "..." are merged into a single
+// CLEX_ext_double_colon / CLEX_ext_dot_dot_dot token. Parse errors reported by stb_c_lexer are
+// logged with printf and produce no token.
+void CodegenLexer::InitializeFrom(std::string_view source) {
+    this->tokens = {};
+    this->idx = 0;
+
+    stb_lexer lexer;
+    // Scratch buffer handed to stb_c_lexer_init() as its string storage
+    char stringStorage[65536];
+    const char* srcBegin = source.data();
+    const char* srcEnd = srcBegin + source.length();
+    stb_c_lexer_init(&lexer, srcBegin, srcEnd, stringStorage, sizeof(stringStorage));
+
+    // Describes a run of single-character tokens that should be replaced by one combined token
+    struct TokenCombiningPattern {
+        StbLexerToken result; // Token emitted in place of the matched run
+        char matchChars[16];  // NUL-terminated sequence of chars to match
+    };
+
+    const TokenCombiningPattern kDoubleColon = {
+        .result = {
+            .text = "::",
+            .type = CLEX_ext_double_colon,
+        },
+        .matchChars = { ':', ':', '\0' },
+    };
+    const TokenCombiningPattern kDotDotDot = {
+        .result = {
+            .text = "...",
+            .type = CLEX_ext_dot_dot_dot,
+        },
+        .matchChars = { '.', '.', '.', '\0' },
+    };
+    const TokenCombiningPattern* const kPatterns[] = { &kDoubleColon, &kDotDotDot };
+
+    // Pattern currently being matched (nullptr when idle), and how many of its chars matched so far
+    const TokenCombiningPattern* currentState = nullptr;
+    int currentStateCharIdx = 0;
+
+    auto resetMatch = [&]() {
+        currentState = nullptr;
+        currentStateCharIdx = 0;
+    };
+    // Begins matching the first pattern whose leading char is `c`, if there is one
+    auto tryStartMatch = [&](char c) {
+        for (const TokenCombiningPattern* pattern : kPatterns) {
+            if (pattern->matchChars[0] == c) {
+                currentState = pattern;
+                currentStateCharIdx = 1;
+                return;
+            }
+        }
+    };
+
+    while (true) {
+        // See stb_c_lexer.h's comments, here are a few additions that aren't made clear in the file:
+        // - `lexer->token` (noted as "token" below) after calling stb_c_lexer_get_token() contains either:
+        //   1. 0 <= token < 256: an ASCII character (more precisely a single char that the lexer ate; technically can be an incomplete code unit)
+        //   2. token < 0: an unknown token
+        //   3. One of the `CLEX_*` enums: a special, recognized token such as an operator
+
+        int stbToken = stb_c_lexer_get_token(&lexer);
+        if (stbToken == 0) {
+            // EOF
+            break;
+        }
+
+        if (lexer.token == CLEX_parse_error) {
+            printf("[ERROR] stb_c_lexer countered a parse error.\n");
+            // TODO how to handle?
+            // Whatever failed to parse sat between the chars seen so far and the next token, so a
+            // partially matched pattern must not carry across it
+            resetMatch();
+            continue;
+        }
+
+        StbLexerToken token;
+        if (StbTokenIsSingleChar(lexer.token)) {
+            const char c = static_cast<char>(lexer.token);
+
+            token.type = CLEX_ext_single_char;
+            token.text = std::string(1, c);
+
+            // NOTE(review): whitespace/comments between the chars are not detected here, so e.g.
+            // ": :" is also merged into "::" -- confirm whether that is intended.
+            if (currentState && currentState->matchChars[currentStateCharIdx] == c) {
+                // Match success
+                ++currentStateCharIdx;
+
+                // If we matched all of the chars...
+                if (currentState->matchChars[currentStateCharIdx] == '\0') {
+                    // We matched (currentStateCharIdx) tokens; this one has not been pushed into the vector yet,
+                    // leaving (currentStateCharIdx - 1) tokens to be removed. Every one of those is a char of
+                    // this pattern, because any other token in between resets the match.
+                    for (int i = 0, count = currentStateCharIdx - 1; i < count; ++i) {
+                        tokens.pop_back();
+                    }
+
+                    // Set the current token to desired result
+                    token = currentState->result;
+
+                    resetMatch();
+                }
+            } else {
+                // Either nothing was being matched, or this char broke the match in progress.
+                // In both cases this char may itself be the first char of a pattern.
+                resetMatch();
+                tryStartMatch(c);
+            }
+        } else {
+            // A multi-char token interrupts any run of single chars; without this reset a later char
+            // could complete the stale pattern and pop unrelated tokens off the vector
+            resetMatch();
+
+            token.type = lexer.token;
+            // WORKAROUND: use null terminated string, stb_c_lexer doesn't set string_len properly when parsing identifiers
+            // NOTE(review): for tokens other than identifiers/string literals, lexer.string presumably still
+            // holds whatever the lexer last wrote there (or nothing at all) -- confirm against stb_c_lexer.h
+            token.text = std::string(lexer.string);
+
+            switch (token.type) {
+                case CLEX_intlit:
+                    token.lexerIntNumber = lexer.int_number;
+                    break;
+
+                case CLEX_floatlit:
+                    token.lexerRealNumber = lexer.real_number;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+        tokens.push_back(std::move(token));
+    }
+}
+
+// Consumes the current token if its type equals `type`.
+// Returns a pointer to the consumed token, or nullptr (leaving the read position unchanged) when
+// the type differs or every token has already been consumed.
+const StbLexerToken* CodegenLexer::TryConsumeToken(int type) {
+    // The read position can legitimately sit one past the last token (e.g. after skipping to the
+    // end); indexing the vector there would read out of bounds
+    if (idx >= tokens.size()) {
+        return nullptr;
+    }
+
+    auto& token = tokens[idx];
+    if (token.type == type) {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+// Consumes the current token if it is a CLEX_ext_single_char token whose character equals `c`.
+// Returns a pointer to the consumed token, or nullptr (leaving the read position unchanged) when
+// it does not match or every token has already been consumed.
+const StbLexerToken* CodegenLexer::TryConsumeSingleCharToken(char c) {
+    // The read position can legitimately sit one past the last token (e.g. after skipping to the
+    // end); indexing the vector there would read out of bounds
+    if (idx >= tokens.size()) {
+        return nullptr;
+    }
+
+    auto& token = tokens[idx];
+    if (token.type == CLEX_ext_single_char &&
+        token.text[0] == c)
+    {
+        ++idx;
+        return &token;
+    }
+    return nullptr;
+}
+
+// Advances the read position until the current token's type equals `type`, or until the end of
+// the token list is reached. The matching token itself is left unconsumed.
+void CodegenLexer::SkipUntilToken(int type) {
+    for (; idx < tokens.size(); ++idx) {
+        if (Current().type == type) {
+            return;
+        }
+    }
+}
+
+// Advances the read position until the current token is a CLEX_ext_single_char token whose
+// character equals `c`, or until the end of the token list is reached. The matching token itself
+// is left unconsumed.
+void CodegenLexer::SkipUntilTokenSingleChar(char c) {
+    const auto isWanted = [c](const StbLexerToken& tok) {
+        return tok.type == CLEX_ext_single_char && tok.text[0] == c;
+    };
+    while (idx < tokens.size() && !isWanted(Current())) {
+        ++idx;
+    }
+}