From f6ae20caf8191f78eea90edfb17f316db8f8c6b5 Mon Sep 17 00:00:00 2001 From: c+1 Date: Thu, 18 May 2023 17:57:56 -0400 Subject: finished the lexer, for real this time --- src/lexer.c | 190 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 94 insertions(+), 96 deletions(-) (limited to 'src/lexer.c') diff --git a/src/lexer.c b/src/lexer.c index f51c1fa..484766f 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -9,11 +9,6 @@ #include "include/token.h" -// TODO: -// lexer_valid(lexer) -> bool -// whether at EOF, already defined just make it cleaner - - lexer_t* lexer_init(char* content) { lexer_t* lexer = calloc(1, sizeof(struct LEXER_STRUC)); @@ -24,8 +19,13 @@ lexer_t* lexer_init(char* content) { return lexer; } +void lexer_destroy(lexer_t* lexer) { + free(lexer->content); + free(lexer); +} + void lexer_next(lexer_t* lexer) { - if (lexer->c != '\0' && lexer->i < strlen(lexer->content)) { + if (LEXER_VALID) { lexer->i += 1; lexer->c = lexer->content[lexer->i]; } @@ -38,34 +38,36 @@ void lexer_pass(lexer_t* lexer) { } token_t* lexer_get_next_token(lexer_t* lexer) { - while (lexer->c != '\0' && lexer->i < strlen(lexer->content)) { + while (LEXER_VALID) { + + if (char_can_ignore(&lexer->c)) { + lexer_pass(lexer); + } - if (char_can_ignore(&lexer->c)) { lexer_pass(lexer); } - if (char_could_start_keyword(&lexer->c)) { return lexer_get_keyword(lexer); } + if (char_could_start_keyword(&lexer->c)) { + return lexer_get_keyword(lexer); + } switch (lexer->c) { - case '\'': - return lexer_get_string(lexer); break; - case '[': - return lexer_get_array(lexer); break; - case ']': return lexer_next_token( - lexer, - token_init( - TOKEN_ARRAY_DELIM_END, - lexer_get_c_as_string(lexer) - ) - ); break; + case '\'': + return lexer_get_string(lexer); + break; + case '`': + return lexer_get_comment(lexer); + break; + case ';': + return lexer_next_token( + lexer, + token_init( + TOKEN_EXPR_END, + lexer_get_c_as_string(lexer) + ) + ); + break; case '=': return lexer_get_def_const(lexer); break; case '-': return lexer_get_def_mut(lexer); break; - case ';': return lexer_next_token( - lexer, - token_init( - TOKEN_EOF, - lexer_get_c_as_string(lexer) - ) - ); break; case '(': return lexer_next_token( lexer, token_init( @@ -127,9 +129,44 @@ token_t* lexer_get_next_token(lexer_t* lexer) { return NULL; } -token_t* lexer_get_string(lexer_t* lexer) { +token_t* lexer_next_token(lexer_t* lexer, token_t* token) { lexer_next(lexer); + + return token; +} + +char* lexer_get_c_as_string(lexer_t* lexer) { + char* str = calloc(2, 1 * sizeof(char)); + str[0] = lexer->c; + str[1] = '\0'; + return str; +} + +// TODO: abstract away this kind of thing +token_t* lexer_get_array(lexer_t* lexer) { + lexer_next(lexer); // skip opening [ + char* array_so_far = calloc(1, sizeof(char)); + array_so_far[0] = '\0'; + + while (lexer->c != ']') { + char* current = lexer_get_c_as_string(lexer); + array_so_far = realloc( + array_so_far, + (strlen(array_so_far) + strlen(current) * sizeof(char)) + ); + + strcat(array_so_far, current); + lexer_next(lexer); + } + + lexer_next(lexer); // skip over closing ] + + return token_init(TOKEN_STR, array_so_far); // return the collected array +} + +token_t* lexer_get_string(lexer_t* lexer) { + lexer_next(lexer); char* str_so_far = calloc(1, sizeof(char)); str_so_far[0] = '\0'; @@ -141,13 +178,12 @@ token_t* lexer_get_string(lexer_t* lexer) { ); strcat(str_so_far, current); - lexer_next(lexer); } - lexer_next(lexer); // skip over closing " + lexer_next(lexer); // skip over closing ' - return token_init(TOKEN_STR, str_so_far); + return token_init(TOKEN_STR, str_so_far); // return the collected string } token_t* lexer_get_comment(lexer_t* lexer) { @@ -155,7 +191,7 @@ token_t* lexer_get_comment(lexer_t* lexer) { char* comment_so_far = calloc(1, sizeof(char)); - while (lexer->c != ']') { + while (lexer->c != '`') { char* current = lexer_get_c_as_string(lexer); comment_so_far = realloc( comment_so_far, @@ -166,62 +202,28 @@ token_t* lexer_get_comment(lexer_t* lexer) { lexer_next(lexer); } - lexer_next(lexer); // skip over closing ] + lexer_next(lexer); // skip over closing ` return token_init(TOKEN_COMM, comment_so_far); } -token_t* lexer_get_def_const(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '>') { - lexer_next(lexer); - return token_init(TOKEN_DEFINE_CONST, "=>"); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - -token_t* lexer_get_def_var(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '=') { - return lexer_get_def_const(lexer); - } else if (lexer_next(lexer), lexer->c == '-') { - return lexer_get_def_mut(lexer); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - -token_t* lexer_get_def_mut(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '>') { - lexer_next(lexer); - return token_init(TOKEN_DEFINE_MUT, "->"); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - token_t* lexer_get_directive(lexer_t* lexer) { lexer_next(lexer); - char* directive_so_far = calloc(1, sizeof(char)); directive_so_far[0] = '\0'; while (lexer->c != ';') { char* current = lexer_get_c_as_string(lexer); - directive_so_far = realloc(directive_so_far, (strlen(directive_so_far) + strlen(current) * sizeof(char))); + directive_so_far = realloc( + directive_so_far, + (strlen(directive_so_far) + strlen(current) * sizeof(char)) + ); + strcat(directive_so_far, current); lexer_next(lexer); } - lexer_next(lexer); + lexer_next(lexer); // skip over closing ; return token_init(TOKEN_DIRECTIVE, directive_so_far); } @@ -229,37 +231,33 @@ token_t* lexer_get_directive(lexer_t* lexer) { token_t* lexer_get_keyword(lexer_t* lexer) { char* keyword_so_far = calloc(1, sizeof(char)); keyword_so_far[0] = '\0'; - - while (isalnum(lexer->c)) { + while (char_could_split_keyword(&lexer->c)) { char* current = lexer_get_c_as_string(lexer); - keyword_so_far = realloc(keyword_so_far, (strlen(keyword_so_far) + strlen(current) * sizeof(char))); + keyword_so_far = realloc( + keyword_so_far, + (strlen(keyword_so_far) + strlen(current) * sizeof(char)) + ); + strcat(keyword_so_far, current); lexer_next(lexer); } - lexer_next(lexer); - return token_init(TOKEN_KEYWORD, keyword_so_far); } -token_t* lexer_next_token(lexer_t* lexer, token_t* token) { - lexer_next(lexer); - - return token; -} - -char* lexer_get_c_as_string(lexer_t* lexer) { - char* str = calloc(2, 1 * sizeof(char)); - str[0] = lexer->c; - str[1] = '\0'; - - - return str; -} - -void lexer_destroy(lexer_t* lexer) { - free(lexer->content); - free(lexer); +token_t* lexer_get_def(lexer_t* lexer) { + char* def_so_far = calloc(1, sizeof(char)); + def_so_far[0] = '\0'; + while (lexer->c != '=') { + char* current = lexer_get_c_as_string(lexer); + def_so_far = realloc( + def_so_far, + (strlen(def_so_far) + strlen(current) * sizeof(char)) + ); + strcat(def_so_far, current); + lexer_next(lexer); + } + return token_init(TOKEN_DEF, def_so_far); } -- cgit v1.2.3