From 668e0fb0f4fc4bdd990d9ab349da445960d5764e Mon Sep 17 00:00:00 2001 From: c+1 Date: Thu, 5 Oct 2023 11:02:22 -0400 Subject: redesign the lexer, some mem leaks :( --- src/include/lexer.h | 82 ++++++++++++++------------ src/include/token.h | 78 +++++++++++------------- src/lexer.c | 167 +++++++++++----------------------------------------- src/main.c | 37 ++++++------ src/token.c | 127 ++++----------------------------------- src/util.c | 1 + 6 files changed, 144 insertions(+), 348 deletions(-) (limited to 'src') diff --git a/src/include/lexer.h b/src/include/lexer.h index 8d4f75f..173c57d 100644 --- a/src/include/lexer.h +++ b/src/include/lexer.h @@ -9,46 +9,52 @@ /* the lexer struct */ typedef struct LEXER_STRUC { - /* current character in content */ - char c; - - /* index of c */ - unsigned int i; - /* source being read */ - char* content; + char* src; + + /* what the lexer is looking at right now */ + enum LEXER_STATE { + /* normal 1-character token */ + LEXER_STATE_REG, + /* character */ + LEXER_STATE_CHR, + /* string */ + LEXER_STATE_STR, + /* definition */ + LEXER_STATE_DEF, + /* call */ + LEXER_STATE_CAL + } state; + + /* the linked list of tokens generated */ + token_t* tokenl; + int tokenc; } lexer_t; - -/* create lexer from source code */ -extern lexer_t* lexer_init (char* content); - -/* destroy the lexer */ -extern void lexer_destroy (lexer_t* lexer); - -/* move lexer forward one char */ -extern void lexer_next (lexer_t* lexer); - -/* skip useless characters */ -extern void lexer_pass (lexer_t* lexer); - -/* create tokens */ -extern token_t* lexer_get_next_token (lexer_t* lexer); - -/* create token and move 1 char */ -extern token_t* lexer_next_token (lexer_t* lexer, int token_type); - -/* create string from lexer->c */ -extern char* lexer_get_c_as_string (lexer_t* lexer); - -/* - int fskip: skip first char? - - int lskip: skip last char? -*/ -extern token_t* lexer_collect (lexer_t* lexer, int (*end_char)(char), int fskip, int lskip, int type); - -/* run lexer from source */ -lexer_t* lexer_run(lexer_t*); +/* create lexer from source */ +lexer_t* lexer_init (char* src); + +/* destroy lexer **but not src or tokenl** */ +void lexer_destroy (lexer_t* lexer); + +/* add token to tokenv */ +void lexer_add_token(lexer_t* lexer, token_t* token); +/* add the current character as a token to tokenl -- utility function for + lexer_do_reg() */ +void lexer_add_current_char(lexer_t* lexer, int type); + +/* handle regular state */ +void lexer_do_reg(lexer_t*); +/* handle character state */ +void lexer_do_chr(lexer_t*); +/* handle string state */ +void lexer_do_str(lexer_t*); +/* handle definition state */ +void lexer_do_def(lexer_t*); +/* handle call state */ +void lexer_do_cal(lexer_t*); + +/* run lexer */ +void lexer_run(lexer_t*); #endif diff --git a/src/include/token.h b/src/include/token.h index 1a307cd..802f13d 100644 --- a/src/include/token.h +++ b/src/include/token.h @@ -1,56 +1,44 @@ #ifndef TOKEN_H #define TOKEN_H -#define TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS "+-/*abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_" -#define TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN 57 -#define TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS "1234567890_-" -#define TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS_LEN 12 -#define TOKEN_CHAR_IGNORE " \t\n\r" -#define TOKEN_CHAR_IGNORE_LEN 4 -#define TOKEN_CHAR_FIRST_CHAR_INT "0123456789" - +#include "util.h" +/* token struct */ typedef struct TOKEN_STRUC { - enum TOKEN_ENUM { - TOKEN_KEYWORD, // keyword - TOKEN_PRIM_STR, // "string" - TOKEN_PRIM_INT, // 42 - TOKEN_COMM, // `comment` - TOKEN_STMNT_END, // ; - TOKEN_LGROUP, // ( - TOKEN_RGROUP, // ) - TOKEN_DIRECTIVE, // #DIRECTIVE# - TOKEN_FN_APPLY, // . - TOKEN_LIST_DELIM, // , - TOKEN_DEF_TAG, // def:def - TOKEN_BLOCK_START, // { - TOKEN_BLOCK_END, // } - TOKEN_NAMESPACE_DELIM, // / - TOKEN_ARRAY_START, // [ - TOKEN_ARRAY_END, // ] - TOKEN_DEF_SET, // = - TOKEN_UNKNOWN, // ??? - TOKEN_EOF, // \0 - } type; - - char* value; + /* token type */ + enum TOKEN_TYPE { + TOKEN_UNKNOWN, + TOKEN_CHAR_DELIM, + TOKEN_STR_DELIM, + TOKEN_COMMENT_DELIM, + TOKEN_EXPR_END, + TOKEN_SET, + TOKEN_LGROUP, + TOKEN_RGROUP, + TOKEN_APPLY, + TOKEN_LIST_DELIM, + TOKEN_TAG_DELIM, + TOKEN_NAMESPACE_DELIM, + TOKEN_LBLOCK, + TOKEN_RBLOCK, + TOKEN_RLIST, + TOKEN_LLIST, + TOKEN_ESC + } type; + + /* token value */ + char* val; + + /* next token */ + struct TOKEN_STRUC* nxt; } token_t; +/* creates a token */ token_t* token_init(int type, char* val); - -char* token_get_type(int type); - -int char_could_start_keyword(char* character); -int char_could_split_keyword(char* character); -int char_could_start_int(char* character); -int char_can_ignore(char* character); - -int token_char_quote(char c); -int token_char_grave(char c); -int token_char_pound(char c); -int token_char_colon(char c); -int token_char_kywrd(char c); - +/* destroys a token **and all tokens contained in nxt** */ void token_destroy(token_t* token); +/* return pointer to the last token */ +token_t* token_last(token_t* token); + #endif diff --git a/src/lexer.c b/src/lexer.c index 8374a90..e9475b6 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -5,12 +5,15 @@ #include "include/lexer.h" -lexer_t* lexer_init(char* content) { - lexer_t* lexer = calloc(1, sizeof(struct LEXER_STRUC)); +lexer_t* lexer_init(char* src) { + lexer_t* lexer; - lexer->content = content; - lexer->i = 0; - lexer->c = content[lexer->i]; + lexer = emalloc(sizeof(struct LEXER_STRUC)); + + lexer->src = src; + lexer->state = LEXER_STATE_REG; + lexer->tokenl = NULL; + lexer->tokenc = 0; return lexer; } @@ -19,144 +22,44 @@ void lexer_destroy(lexer_t* lexer) { free(lexer); } -void lexer_next(lexer_t* lexer) { - if (LEXER_VALID) { - lexer->i ++; - lexer->c = lexer->content[lexer->i]; - } -} +void lexer_add_token(lexer_t* lexer, token_t* token) { + token_t* t; -void lexer_pass(lexer_t* lexer) { - while (char_can_ignore(&lexer->c)) { - lexer_next(lexer); - } -} + t = token_last(lexer->tokenl)->nxt; + t = token; -token_t* lexer_get_next_token(lexer_t* lexer) { - while (LEXER_VALID) { - - if (char_can_ignore(&lexer->c)) { lexer_pass(lexer); } - if (char_could_start_int(&lexer->c)) { return lexer_next_token(lexer, TOKEN_PRIM_INT); } - if (char_could_start_keyword(&lexer->c)) { return lexer_collect(lexer, token_char_kywrd, 0, 0, TOKEN_KEYWORD); } - - switch (lexer->c) { - case SYNTAX_STR_DELIM: - return lexer_collect(lexer, token_char_quote, 1, 1, TOKEN_PRIM_STR); - break; - case SYNTAX_EXPR_END: - return lexer_next_token(lexer, TOKEN_STMNT_END); - break; - case SYNTAX_SET: - return lexer_next_token(lexer, TOKEN_DEF_SET); - break; - case SYNTAX_LGROUP: - return lexer_next_token(lexer, TOKEN_LGROUP); - break; - case SYNTAX_RGROUP: - return lexer_next_token(lexer, TOKEN_RGROUP); - break; - case SYNTAX_APPLY: - return lexer_next_token(lexer, TOKEN_FN_APPLY); - break; - case SYNTAX_LIST_DELIM: - return lexer_next_token(lexer, TOKEN_LIST_DELIM); - break; - case SYNTAX_TAG_DELIM: - return lexer_collect(lexer, token_char_kywrd, 1, 0, TOKEN_DEF_TAG); - break; - case SYNTAX_NAMESPACE_DELIM: - return lexer_next_token(lexer, TOKEN_NAMESPACE_DELIM); - break; - case SYNTAX_LBLOCK: - return lexer_next_token(lexer, TOKEN_BLOCK_START); - break; - case SYNTAX_RBLOCK: - return lexer_next_token(lexer, TOKEN_BLOCK_END); - break; - case SYNTAX_LLIST: - return lexer_next_token(lexer, TOKEN_ARRAY_START); - break; - case SYNTAX_RLIST: - return lexer_next_token(lexer, TOKEN_ARRAY_END); - break; - case '\0': - case EOF: - return token_init(TOKEN_EOF, lexer_get_c_as_string(lexer)); - break; - default: - return lexer_next_token(lexer, TOKEN_UNKNOWN); - break; - } - } + log_inf("token/v:%s\t/t:%d", token->val, token->type); - return NULL; + lexer->tokenc ++; } -token_t* lexer_next_token(lexer_t* lexer, int token_type) { - token_t* token = token_init(token_type, lexer_get_c_as_string(lexer)); - lexer_next(lexer); - return token; -} - -char* lexer_get_c_as_string(lexer_t* lexer) { - char* str; /* the string to return */ - - str = malloc(2); - str[0] = lexer->c; - str[1] = '\0'; - - return str; -} - -token_t* lexer_collect(lexer_t* lexer, int (*end_char)(char), int fskip, int lskip, int type) { - size_t len; /* length of collected token so far */ - char* token; /* collected token so far */ - - len = 0; - token = calloc(len, sizeof(char)); - - if (fskip) { lexer_next(lexer); } +void lexer_add_current_char(lexer_t* lexer, int type) { + char* c; /* get the current character as a string */ + token_t* t; /* the token to be added */ - while (end_char(lexer->c)) { - char* current; + c = ecalloc(2, sizeof(char)); + c[0] = *lexer->src; + c[1] = '\0'; - current = lexer_get_c_as_string(lexer); - token = realloc( - token, - (len + sizeof(current)) - ); + t = token_init(type, c); - memcpy(token + len, current, sizeof(char) * strlen(current)); - len += strlen(current) * sizeof(char); - lexer_next(lexer); - - free(current); - } - - if (lskip) { lexer_next(lexer); } - - token[len] = '\0'; /* terminate */ - - return token_init(type, token); + lexer_add_token(lexer, t); } -lexer_t* lexer_run(lexer_t* lexer) { - while (1) { - token_t* token; - char* type; - - token = lexer_get_next_token(lexer); - type = token_get_type(token->type); - - log_inf("type: %s\t\tval:%s", type, token->value); - - if (token->type == TOKEN_EOF) { - token_destroy(token); +void lexer_do_reg(lexer_t* lexer) { + switch (*lexer->src) { + case SYNTAX_APPLY: + lexer_add_current_char(lexer, TOKEN_APPLY); break; - } - - token_destroy(token); + default: + lexer_add_current_char(lexer, TOKEN_UNKNOWN); + } +} - return lexer; +void lexer_run(lexer_t* lexer) { + while (*lexer->src) { + if (lexer->state == LEXER_STATE_REG) { lexer_do_reg(lexer); } + lexer->src ++; + } } diff --git a/src/main.c b/src/main.c index c084002..c0a8d43 100644 --- a/src/main.c +++ b/src/main.c @@ -9,40 +9,41 @@ #include "include/hlkt.h" int main(int argc, char* argv[]) { - char* source; - - lexer_t* lexer; + char* src; /* the source "code" */ pp_t* pp; + lexer_t* lexer; - source = source_get(argv[1]); - HLKT_ASS(source); + /* get source */ + src = source_get(argv[1]); + HLKT_ASS(src); log_inf("source gotten"); - log_inf("source: %s", source); - pp = pp_init(source); + /* create pre-processor */ + pp = pp_init(src); HLKT_ASS(pp); log_inf("preprocessor created"); + /* pre-process source */ pp_run(pp); - free(source); - source = pp->psrc; - pp_destroy(pp); - HLKT_ASS(source); + free(src); + src = pp->psrc; + HLKT_ASS(src); log_inf("preprocessor ran"); - log_inf("preprocessed source: %s", source); - /* - lexer = lexer_init(source); + /* create lexer */ + lexer = lexer_init(src); HLKT_ASS(lexer); log_inf("lexer created"); + /* run lexer */ lexer_run(lexer); + log_inf("lexer ran"); + /* clean up */ + pp_destroy(pp); lexer_destroy(lexer); - */ - free(source); - /*free(pp->psrc);*/ - + token_destroy(lexer->tokenl); + free(src); HLKT_LOG(); diff --git a/src/token.c b/src/token.c index 355a6dd..26af598 100644 --- a/src/token.c +++ b/src/token.c @@ -3,129 +3,26 @@ #include "include/token.h" token_t* token_init(int type, char* val) { - token_t* token = calloc(1, sizeof(struct TOKEN_STRUC)); + token_t* token; + + token = emalloc(sizeof(struct TOKEN_STRUC)); token->type = type; - token->value = val; + token->val = val; + token->nxt = NULL; return token; } -char* token_get_type(int type) { - switch (type) { - case TOKEN_KEYWORD: - return "TOKEN_KEYWORD"; - break; - case TOKEN_PRIM_STR: - return "TOKEN_PRIM_STR"; - break; - case TOKEN_PRIM_INT: - return "TOKEN_PRIM_INT"; - break; - case TOKEN_COMM: - return "TOKEN_COMM"; - break; - case TOKEN_STMNT_END: - return "TOKEN_EXPR_END"; - break; - case TOKEN_LGROUP: - return "TOKEN_LGROUP"; - break; - case TOKEN_RGROUP: - return "TOKEN_RGROUP"; - break; - case TOKEN_DIRECTIVE: - return "TOKEN_DIRECTIVE"; - break; - case TOKEN_FN_APPLY: - return "TOKEN_FN_APPLY"; - break; - case TOKEN_LIST_DELIM: - return "TOKEN_LIST_DELIM"; - break; - case TOKEN_DEF_TAG: - return "TOKEN_DEF_TAG"; - break; - case TOKEN_BLOCK_START: - return "TOKEN_BLOCK_START"; - break; - case TOKEN_BLOCK_END: - return "TOKEN_BLOCK_END"; - break; - case TOKEN_NAMESPACE_DELIM: - return "TOKEN_NAMESPACE_DELIM"; - break; - case TOKEN_ARRAY_START: - return "TOKEN_ARRAY_START"; - break; - case TOKEN_ARRAY_END: - return "TOKEN_ARRAY_END"; - break; - case TOKEN_DEF_SET: - return "TOKEN_DEF_SET"; - break; - case TOKEN_UNKNOWN: - return "TOKEN_UNKNOWN"; - break; - case TOKEN_EOF: - return "TOKEN_EOF"; - break; - default: - return "???"; - } -} - -int char_could_start_keyword(char* character) { - for (int i = 0; i < TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN; ++ i) { - if (TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS[i] == *character) { - return 1; - } - } - - return 0; -} - -int char_could_split_keyword(char* character) { - if (char_could_start_keyword(character)) { - return 1; - } else { - for (int i = 0; i < TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS_LEN; ++ i) { - if (TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS[i] == *character) { - return 1; - } - } - - return 0; - } +void token_destroy(token_t* token) { + free(token); } -int char_could_start_int(char* character) { - for (int i = 0; i < 10; ++ i) { - if (TOKEN_CHAR_FIRST_CHAR_INT[i] == *character) { - return 1; - } - } +token_t* token_last(token_t* token) { + token_t* t; - return 0; -} - -int char_can_ignore(char* character) { - for (int i = 0; i < TOKEN_CHAR_IGNORE_LEN; ++ i) { - if (TOKEN_CHAR_IGNORE[i] == *character) { - return 1; - } + while (t->nxt) { + t = t->nxt; } - return 0; -} - - -int token_char_quote(char c) { return (c != '\''); } -int token_char_grave(char c) { return (c != '`'); } -int token_char_pound(char c) { return (c != '#'); } -int token_char_colon(char c) { return (c != ':'); } -int token_char_kywrd(char c) { return (char_could_split_keyword(&c)); } - -void token_destroy(token_t* token) { - free(token->value); - free(token); + return t; } diff --git a/src/util.c b/src/util.c index cee2beb..a175f6b 100644 --- a/src/util.c +++ b/src/util.c @@ -1,5 +1,6 @@ #include "include/util.h" + void die(const char* fmt, ...) { va_list ap; -- cgit v1.2.3