From f6ae20caf8191f78eea90edfb17f316db8f8c6b5 Mon Sep 17 00:00:00 2001 From: c+1 Date: Thu, 18 May 2023 17:57:56 -0400 Subject: finished the lexer, for real this time --- src/include/lexer.h | 13 ++-- src/include/token.h | 5 +- src/lexer.c | 190 ++++++++++++++++++++++++++-------------------------- 3 files changed, 106 insertions(+), 102 deletions(-) (limited to 'src') diff --git a/src/include/lexer.h b/src/include/lexer.h index 6d9dff8..7364910 100644 --- a/src/include/lexer.h +++ b/src/include/lexer.h @@ -5,6 +5,9 @@ #include "token.h" +#define LEXER_VALID (lexer->c != '\0' && lexer->i < strlen(lexer->content)) + + typedef struct LEXER_STRUC { char c; // current character in content unsigned int i; // index of c @@ -19,8 +22,7 @@ extern void lexer_destroy (lexer_t* lexer); // advance the lexer extern void lexer_next (lexer_t* lexer); extern void lexer_pass (lexer_t* lexer); -extern int lexer_is_valid (lexer_t* lexer); -extern token_t* lexer_get_next_token (lexer_t* lexer); +extern token_t* lexer_get_next_token (lexer_t* lexer); // chars -> tokens extern token_t* lexer_next_token (lexer_t* lexer, token_t* token); extern char* lexer_get_c_as_string (lexer_t* lexer); @@ -29,12 +31,13 @@ extern token_t* lexer_get_array (lexer_t* lexer); extern token_t* lexer_get_string (lexer_t* lexer); extern token_t* lexer_get_comment (lexer_t* lexer); -// def collectors +// special def collectors extern token_t* lexer_get_directive (lexer_t* lexer); extern token_t* lexer_get_keyword (lexer_t* lexer); -extern token_t* lexer_get_def (lexer_t* lexer); +extern token_t* lexer_get_def_var (lexer_t* lexer); // = +/* extern token_t* lexer_get_def (lexer_t* lexer); // get the definition extern token_t* lexer_get_def_mutability (lexer_t* lexer); // look for mut: or immut: (optional, default immut) extern token_t* lexer_get_def_type (lexer_t* lexer); // get the def type (required) @@ -49,6 +52,6 @@ extern token_t* lexer_get_var_value (lexer_t* lexer); // get the var extern token_t* lexer_get_fn_def (lexer_t* lexer); extern token_t* lexer_get_fn_args (lexer_t* lexer); // get the function args (optional, default empty) extern token_t* lexer_get_fn_body (lexer_t* lexer); // get the function body (required) - +*/ #endif diff --git a/src/include/token.h b/src/include/token.h index 45154f4..f7a166f 100644 --- a/src/include/token.h +++ b/src/include/token.h @@ -16,7 +16,8 @@ typedef struct TOKEN_STRUC { TOKEN_DIRECTIVE, // #DIRECTIVE; TOKEN_FN_APPLY, // . TOKEN_LIST_DELIM, // , - TOKEN_DEF_ARGS_DELIM, // : + TOKEN_DEF_TAGS_DELIM, // : + TOKEN_DEF, // def:def TOKEN_BLOCK_DELIM_START, // { TOKEN_BLOCK_DELIM_END, // } TOKEN_NAMESPACE_DELIM, // / @@ -28,6 +29,8 @@ typedef struct TOKEN_STRUC { char* value; } token_t; +token_t* token_init(int type, char* val); + char TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS[] = "abcdefghijklmnopqrstuvwxyz_"; // chars that can begin a var name int TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN = 27; // maximum efficiency! char TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS[] = "1234567890_-"; // chars that can be in the rest of the var name, diff --git a/src/lexer.c b/src/lexer.c index f51c1fa..484766f 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -9,11 +9,6 @@ #include "include/token.h" -// TODO: -// lexer_valid(lexer) -> bool -// whether at EOF, already defined just make it cleaner - - lexer_t* lexer_init(char* content) { lexer_t* lexer = calloc(1, sizeof(struct LEXER_STRUC)); @@ -24,8 +19,13 @@ lexer_t* lexer_init(char* content) { return lexer; } +void lexer_destroy(lexer_t* lexer) { + free(lexer->content); + free(lexer); +} + void lexer_next(lexer_t* lexer) { - if (lexer->c != '\0' && lexer->i < strlen(lexer->content)) { + if (LEXER_VALID) { lexer->i += 1; lexer->c = lexer->content[lexer->i]; } @@ -38,34 +38,36 @@ void lexer_pass(lexer_t* lexer) { } token_t* lexer_get_next_token(lexer_t* lexer) { - while (lexer->c != '\0' && lexer->i < strlen(lexer->content)) { + while (LEXER_VALID) { + + if (char_can_ignore(&lexer->c)) { + lexer_pass(lexer); + } - if (char_can_ignore(&lexer->c)) { lexer_pass(lexer); } - if (char_could_start_keyword(&lexer->c)) { return lexer_get_keyword(lexer); } + if (char_could_start_keyword(&lexer->c)) { + return lexer_get_keyword(lexer); + } switch (lexer->c) { - case '\'': - return lexer_get_string(lexer); break; - case '[': - return lexer_get_array(lexer); break; - case ']': return lexer_next_token( - lexer, - token_init( - TOKEN_ARRAY_DELIM_END, - lexer_get_c_as_string(lexer) - ) - ); break; + case '\'': + return lexer_get_string(lexer); + break; + case '`': + return lexer_get_comment(lexer); + break; + case ';': + return lexer_next_token( + lexer, + token_init( + TOKEN_EXPR_END, + lexer_get_c_as_string(lexer) + ) + ); + break; case '=': return lexer_get_def_const(lexer); break; case '-': return lexer_get_def_mut(lexer); break; - case ';': return lexer_next_token( - lexer, - token_init( - TOKEN_EOF, - lexer_get_c_as_string(lexer) - ) - ); break; case '(': return lexer_next_token( lexer, token_init( @@ -127,9 +129,44 @@ token_t* lexer_get_next_token(lexer_t* lexer) { return NULL; } -token_t* lexer_get_string(lexer_t* lexer) { +token_t* lexer_next_token(lexer_t* lexer, token_t* token) { lexer_next(lexer); + + return token; +} + +char* lexer_get_c_as_string(lexer_t* lexer) { + char* str = calloc(2, 1 * sizeof(char)); + str[0] = lexer->c; + str[1] = '\0'; + return str; +} + +// TODO: abstract away this kind of thing +token_t* lexer_get_array(lexer_t* lexer) { + lexer_next(lexer); // skip opening [ + char* array_so_far = calloc(1, sizeof(char)); + array_so_far[0] = '\0'; + + while (lexer->c != ']') { + char* current = lexer_get_c_as_string(lexer); + array_so_far = realloc( + array_so_far, + (strlen(array_so_far) + strlen(current) * sizeof(char)) + ); + + strcat(array_so_far, current); + lexer_next(lexer); + } + + lexer_next(lexer); // skip over closing ] + + return token_init(TOKEN_STR, array_so_far); // return the collected array +} + +token_t* lexer_get_string(lexer_t* lexer) { + lexer_next(lexer); char* str_so_far = calloc(1, sizeof(char)); str_so_far[0] = '\0'; @@ -141,13 +178,12 @@ token_t* lexer_get_string(lexer_t* lexer) { ); strcat(str_so_far, current); - lexer_next(lexer); } - lexer_next(lexer); // skip over closing " + lexer_next(lexer); // skip over closing ' - return token_init(TOKEN_STR, str_so_far); + return token_init(TOKEN_STR, str_so_far); // return the collected string } token_t* lexer_get_comment(lexer_t* lexer) { @@ -155,7 +191,7 @@ token_t* lexer_get_comment(lexer_t* lexer) { char* comment_so_far = calloc(1, sizeof(char)); - while (lexer->c != ']') { + while (lexer->c != '`') { char* current = lexer_get_c_as_string(lexer); comment_so_far = realloc( comment_so_far, @@ -166,62 +202,28 @@ token_t* lexer_get_comment(lexer_t* lexer) { lexer_next(lexer); } - lexer_next(lexer); // skip over closing ] + lexer_next(lexer); // skip over closing ` return token_init(TOKEN_COMM, comment_so_far); } -token_t* lexer_get_def_const(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '>') { - lexer_next(lexer); - return token_init(TOKEN_DEFINE_CONST, "=>"); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - -token_t* lexer_get_def_var(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '=') { - return lexer_get_def_const(lexer); - } else if (lexer_next(lexer), lexer->c == '-') { - return lexer_get_def_mut(lexer); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - -token_t* lexer_get_def_mut(lexer_t* lexer) { - lexer_pass(lexer); - - if (lexer_next(lexer), lexer->c == '>') { - lexer_next(lexer); - return token_init(TOKEN_DEFINE_MUT, "->"); - } else { - log_err("Unknown variable state."); - exit(1); - } -} - token_t* lexer_get_directive(lexer_t* lexer) { lexer_next(lexer); - char* directive_so_far = calloc(1, sizeof(char)); directive_so_far[0] = '\0'; while (lexer->c != ';') { char* current = lexer_get_c_as_string(lexer); - directive_so_far = realloc(directive_so_far, (strlen(directive_so_far) + strlen(current) * sizeof(char))); + directive_so_far = realloc( + directive_so_far, + (strlen(directive_so_far) + strlen(current) * sizeof(char)) + ); + strcat(directive_so_far, current); lexer_next(lexer); } - lexer_next(lexer); + lexer_next(lexer); // skip over closing ; return token_init(TOKEN_DIRECTIVE, directive_so_far); } @@ -229,37 +231,33 @@ token_t* lexer_get_directive(lexer_t* lexer) { token_t* lexer_get_keyword(lexer_t* lexer) { char* keyword_so_far = calloc(1, sizeof(char)); keyword_so_far[0] = '\0'; - - while (isalnum(lexer->c)) { + while (char_could_split_keyword(&lexer->c)) { char* current = lexer_get_c_as_string(lexer); - keyword_so_far = realloc(keyword_so_far, (strlen(keyword_so_far) + strlen(current) * sizeof(char))); + keyword_so_far = realloc( + keyword_so_far, + (strlen(keyword_so_far) + strlen(current) * sizeof(char)) + ); + strcat(keyword_so_far, current); lexer_next(lexer); } - lexer_next(lexer); - return token_init(TOKEN_KEYWORD, keyword_so_far); } -token_t* lexer_next_token(lexer_t* lexer, token_t* token) { - lexer_next(lexer); - - return token; -} - -char* lexer_get_c_as_string(lexer_t* lexer) { - char* str = calloc(2, 1 * sizeof(char)); - str[0] = lexer->c; - str[1] = '\0'; - - - return str; -} - -void lexer_destroy(lexer_t* lexer) { - free(lexer->content); - free(lexer); +token_t* lexer_get_def(lexer_t* lexer) { + char* def_so_far = calloc(1, sizeof(char)); + def_so_far[0] = '\0'; + while (lexer->c != '=') { + char* current = lexer_get_c_as_string(lexer); + def_so_far = realloc( + def_so_far, + (strlen(def_so_far) + strlen(current) * sizeof(char)) + ); + strcat(def_so_far, current); + lexer_next(lexer); + } + return token_init(TOKEN_DEF, def_so_far); } -- cgit v1.2.3