aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorc+12023-10-05 11:02:22 -0400
committerc+12023-10-05 11:02:22 -0400
commit668e0fb0f4fc4bdd990d9ab349da445960d5764e (patch)
treea6411de1b7806d6cb91d84158cd7bc513dee235b /src
parent4d9dd289951589ddf408fdec62245b20cfe199c7 (diff)
redesign the lexer, some mem leaks :(
Diffstat (limited to 'src')
-rw-r--r--src/include/lexer.h82
-rw-r--r--src/include/token.h78
-rw-r--r--src/lexer.c167
-rw-r--r--src/main.c37
-rw-r--r--src/token.c127
-rw-r--r--src/util.c1
6 files changed, 144 insertions, 348 deletions
diff --git a/src/include/lexer.h b/src/include/lexer.h
index 8d4f75f..173c57d 100644
--- a/src/include/lexer.h
+++ b/src/include/lexer.h
@@ -9,46 +9,52 @@
/* the lexer struct */
typedef struct LEXER_STRUC {
- /* current character in content */
- char c;
-
- /* index of c */
- unsigned int i;
-
/* source being read */
- char* content;
+ char* src;
+
+ /* what the lexer is looking at right now */
+ enum LEXER_STATE {
+ /* normal 1-character token */
+ LEXER_STATE_REG,
+ /* character */
+ LEXER_STATE_CHR,
+ /* string */
+ LEXER_STATE_STR,
+ /* definition */
+ LEXER_STATE_DEF,
+ /* call */
+ LEXER_STATE_CAL
+ } state;
+
+ /* the linked list of tokens generated */
+ token_t* tokenl;
+ int tokenc;
} lexer_t;
-
-/* create lexer from source code */
-extern lexer_t* lexer_init (char* content);
-
-/* destroy the lexer */
-extern void lexer_destroy (lexer_t* lexer);
-
-/* move lexer forward one char */
-extern void lexer_next (lexer_t* lexer);
-
-/* skip useless characters */
-extern void lexer_pass (lexer_t* lexer);
-
-/* create tokens */
-extern token_t* lexer_get_next_token (lexer_t* lexer);
-
-/* create token and move 1 char */
-extern token_t* lexer_next_token (lexer_t* lexer, int token_type);
-
-/* create string from lexer->c */
-extern char* lexer_get_c_as_string (lexer_t* lexer);
-
-/*
- int fskip: skip first char?
-
- int lskip: skip last char?
-*/
-extern token_t* lexer_collect (lexer_t* lexer, int (*end_char)(char), int fskip, int lskip, int type);
-
-/* run lexer from source */
-lexer_t* lexer_run(lexer_t*);
+/* create lexer from source */
+lexer_t* lexer_init (char* src);
+
+/* destroy lexer **but not src or tokenl** */
+void lexer_destroy (lexer_t* lexer);
+
+/* add token to tokenl */
+void lexer_add_token(lexer_t* lexer, token_t* token);
+/* add the current character as a token to tokenl -- utility function for
+ lexer_do_reg() */
+void lexer_add_current_char(lexer_t* lexer, int type);
+
+/* handle regular state */
+void lexer_do_reg(lexer_t*);
+/* handle character state */
+void lexer_do_chr(lexer_t*);
+/* handle string state */
+void lexer_do_str(lexer_t*);
+/* handle definition state */
+void lexer_do_def(lexer_t*);
+/* handle call state */
+void lexer_do_cal(lexer_t*);
+
+/* run lexer */
+void lexer_run(lexer_t*);
#endif
diff --git a/src/include/token.h b/src/include/token.h
index 1a307cd..802f13d 100644
--- a/src/include/token.h
+++ b/src/include/token.h
@@ -1,56 +1,44 @@
#ifndef TOKEN_H
#define TOKEN_H
-#define TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS "+-/*abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
-#define TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN 57
-#define TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS "1234567890_-"
-#define TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS_LEN 12
-#define TOKEN_CHAR_IGNORE " \t\n\r"
-#define TOKEN_CHAR_IGNORE_LEN 4
-#define TOKEN_CHAR_FIRST_CHAR_INT "0123456789"
-
+#include "util.h"
+/* token struct */
typedef struct TOKEN_STRUC {
- enum TOKEN_ENUM {
- TOKEN_KEYWORD, // keyword
- TOKEN_PRIM_STR, // "string"
- TOKEN_PRIM_INT, // 42
- TOKEN_COMM, // `comment`
- TOKEN_STMNT_END, // ;
- TOKEN_LGROUP, // (
- TOKEN_RGROUP, // )
- TOKEN_DIRECTIVE, // #DIRECTIVE#
- TOKEN_FN_APPLY, // .
- TOKEN_LIST_DELIM, // ,
- TOKEN_DEF_TAG, // def:def
- TOKEN_BLOCK_START, // {
- TOKEN_BLOCK_END, // }
- TOKEN_NAMESPACE_DELIM, // /
- TOKEN_ARRAY_START, // [
- TOKEN_ARRAY_END, // ]
- TOKEN_DEF_SET, // =
- TOKEN_UNKNOWN, // ???
- TOKEN_EOF, // \0
- } type;
-
- char* value;
+ /* token type */
+ enum TOKEN_TYPE {
+ TOKEN_UNKNOWN,
+ TOKEN_CHAR_DELIM,
+ TOKEN_STR_DELIM,
+ TOKEN_COMMENT_DELIM,
+ TOKEN_EXPR_END,
+ TOKEN_SET,
+ TOKEN_LGROUP,
+ TOKEN_RGROUP,
+ TOKEN_APPLY,
+ TOKEN_LIST_DELIM,
+ TOKEN_TAG_DELIM,
+ TOKEN_NAMESPACE_DELIM,
+ TOKEN_LBLOCK,
+ TOKEN_RBLOCK,
+ TOKEN_RLIST,
+ TOKEN_LLIST,
+ TOKEN_ESC
+ } type;
+
+ /* token value */
+ char* val;
+
+ /* next token */
+ struct TOKEN_STRUC* nxt;
} token_t;
+/* creates a token */
token_t* token_init(int type, char* val);
-
-char* token_get_type(int type);
-
-int char_could_start_keyword(char* character);
-int char_could_split_keyword(char* character);
-int char_could_start_int(char* character);
-int char_can_ignore(char* character);
-
-int token_char_quote(char c);
-int token_char_grave(char c);
-int token_char_pound(char c);
-int token_char_colon(char c);
-int token_char_kywrd(char c);
-
+/* destroys a token -- NOTE(review): intended to free the whole nxt chain, but the implementation currently frees only this node; val and nxt are leaked */
void token_destroy(token_t* token);
+/* return pointer to the last token */
+token_t* token_last(token_t* token);
+
#endif
diff --git a/src/lexer.c b/src/lexer.c
index 8374a90..e9475b6 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -5,12 +5,15 @@
#include "include/lexer.h"
-lexer_t* lexer_init(char* content) {
- lexer_t* lexer = calloc(1, sizeof(struct LEXER_STRUC));
+lexer_t* lexer_init(char* src) {
+ lexer_t* lexer;
- lexer->content = content;
- lexer->i = 0;
- lexer->c = content[lexer->i];
+ lexer = emalloc(sizeof(struct LEXER_STRUC));
+
+ lexer->src = src;
+ lexer->state = LEXER_STATE_REG;
+ lexer->tokenl = NULL;
+ lexer->tokenc = 0;
return lexer;
}
@@ -19,144 +22,44 @@ void lexer_destroy(lexer_t* lexer) {
free(lexer);
}
-void lexer_next(lexer_t* lexer) {
- if (LEXER_VALID) {
- lexer->i ++;
- lexer->c = lexer->content[lexer->i];
- }
-}
+void lexer_add_token(lexer_t* lexer, token_t* token) {
+ token_t* t;
-void lexer_pass(lexer_t* lexer) {
- while (char_can_ignore(&lexer->c)) {
- lexer_next(lexer);
- }
-}
+ t = token_last(lexer->tokenl)->nxt;
+ t = token;
-token_t* lexer_get_next_token(lexer_t* lexer) {
- while (LEXER_VALID) {
-
- if (char_can_ignore(&lexer->c)) { lexer_pass(lexer); }
- if (char_could_start_int(&lexer->c)) { return lexer_next_token(lexer, TOKEN_PRIM_INT); }
- if (char_could_start_keyword(&lexer->c)) { return lexer_collect(lexer, token_char_kywrd, 0, 0, TOKEN_KEYWORD); }
-
- switch (lexer->c) {
- case SYNTAX_STR_DELIM:
- return lexer_collect(lexer, token_char_quote, 1, 1, TOKEN_PRIM_STR);
- break;
- case SYNTAX_EXPR_END:
- return lexer_next_token(lexer, TOKEN_STMNT_END);
- break;
- case SYNTAX_SET:
- return lexer_next_token(lexer, TOKEN_DEF_SET);
- break;
- case SYNTAX_LGROUP:
- return lexer_next_token(lexer, TOKEN_LGROUP);
- break;
- case SYNTAX_RGROUP:
- return lexer_next_token(lexer, TOKEN_RGROUP);
- break;
- case SYNTAX_APPLY:
- return lexer_next_token(lexer, TOKEN_FN_APPLY);
- break;
- case SYNTAX_LIST_DELIM:
- return lexer_next_token(lexer, TOKEN_LIST_DELIM);
- break;
- case SYNTAX_TAG_DELIM:
- return lexer_collect(lexer, token_char_kywrd, 1, 0, TOKEN_DEF_TAG);
- break;
- case SYNTAX_NAMESPACE_DELIM:
- return lexer_next_token(lexer, TOKEN_NAMESPACE_DELIM);
- break;
- case SYNTAX_LBLOCK:
- return lexer_next_token(lexer, TOKEN_BLOCK_START);
- break;
- case SYNTAX_RBLOCK:
- return lexer_next_token(lexer, TOKEN_BLOCK_END);
- break;
- case SYNTAX_LLIST:
- return lexer_next_token(lexer, TOKEN_ARRAY_START);
- break;
- case SYNTAX_RLIST:
- return lexer_next_token(lexer, TOKEN_ARRAY_END);
- break;
- case '\0':
- case EOF:
- return token_init(TOKEN_EOF, lexer_get_c_as_string(lexer));
- break;
- default:
- return lexer_next_token(lexer, TOKEN_UNKNOWN);
- break;
- }
- }
+ log_inf("token/v:%s\t/t:%d", token->val, token->type);
- return NULL;
+ lexer->tokenc ++;
}
-token_t* lexer_next_token(lexer_t* lexer, int token_type) {
- token_t* token = token_init(token_type, lexer_get_c_as_string(lexer));
- lexer_next(lexer);
- return token;
-}
-
-char* lexer_get_c_as_string(lexer_t* lexer) {
- char* str; /* the string to return */
-
- str = malloc(2);
- str[0] = lexer->c;
- str[1] = '\0';
-
- return str;
-}
-
-token_t* lexer_collect(lexer_t* lexer, int (*end_char)(char), int fskip, int lskip, int type) {
- size_t len; /* length of collected token so far */
- char* token; /* collected token so far */
-
- len = 0;
- token = calloc(len, sizeof(char));
-
- if (fskip) { lexer_next(lexer); }
+void lexer_add_current_char(lexer_t* lexer, int type) {
+ char* c; /* get the current character as a string */
+ token_t* t; /* the token to be added */
- while (end_char(lexer->c)) {
- char* current;
+ c = ecalloc(2, sizeof(char));
+ c[0] = *lexer->src;
+ c[1] = '\0';
- current = lexer_get_c_as_string(lexer);
- token = realloc(
- token,
- (len + sizeof(current))
- );
+ t = token_init(type, c);
- memcpy(token + len, current, sizeof(char) * strlen(current));
- len += strlen(current) * sizeof(char);
- lexer_next(lexer);
-
- free(current);
- }
-
- if (lskip) { lexer_next(lexer); }
-
- token[len] = '\0'; /* terminate */
-
- return token_init(type, token);
+ lexer_add_token(lexer, t);
}
-lexer_t* lexer_run(lexer_t* lexer) {
- while (1) {
- token_t* token;
- char* type;
-
- token = lexer_get_next_token(lexer);
- type = token_get_type(token->type);
-
- log_inf("type: %s\t\tval:%s", type, token->value);
-
- if (token->type == TOKEN_EOF) {
- token_destroy(token);
+void lexer_do_reg(lexer_t* lexer) {
+ switch (*lexer->src) {
+ case SYNTAX_APPLY:
+ lexer_add_current_char(lexer, TOKEN_APPLY);
break;
- }
-
- token_destroy(token);
+ default:
+ lexer_add_current_char(lexer, TOKEN_UNKNOWN);
+
}
+}
- return lexer;
+void lexer_run(lexer_t* lexer) {
+ while (*lexer->src) {
+ if (lexer->state == LEXER_STATE_REG) { lexer_do_reg(lexer); }
+ lexer->src ++;
+ }
}
diff --git a/src/main.c b/src/main.c
index c084002..c0a8d43 100644
--- a/src/main.c
+++ b/src/main.c
@@ -9,40 +9,41 @@
#include "include/hlkt.h"
int main(int argc, char* argv[]) {
- char* source;
-
- lexer_t* lexer;
+ char* src; /* the source "code" */
pp_t* pp;
+ lexer_t* lexer;
- source = source_get(argv[1]);
- HLKT_ASS(source);
+ /* get source */
+ src = source_get(argv[1]);
+ HLKT_ASS(src);
log_inf("source gotten");
- log_inf("source: %s", source);
- pp = pp_init(source);
+ /* create pre-processor */
+ pp = pp_init(src);
HLKT_ASS(pp);
log_inf("preprocessor created");
+ /* pre-process source */
pp_run(pp);
- free(source);
- source = pp->psrc;
- pp_destroy(pp);
- HLKT_ASS(source);
+ free(src);
+ src = pp->psrc;
+ HLKT_ASS(src);
log_inf("preprocessor ran");
- log_inf("preprocessed source: %s", source);
- /*
- lexer = lexer_init(source);
+ /* create lexer */
+ lexer = lexer_init(src);
HLKT_ASS(lexer);
log_inf("lexer created");
+ /* run lexer */
lexer_run(lexer);
+ log_inf("lexer ran");
+ /* clean up */
+ pp_destroy(pp);
lexer_destroy(lexer);
- */
- free(source);
- /*free(pp->psrc);*/
-
+ token_destroy(lexer->tokenl);
+ free(src);
HLKT_LOG();
diff --git a/src/token.c b/src/token.c
index 355a6dd..26af598 100644
--- a/src/token.c
+++ b/src/token.c
@@ -3,129 +3,26 @@
#include "include/token.h"
token_t* token_init(int type, char* val) {
- token_t* token = calloc(1, sizeof(struct TOKEN_STRUC));
+ token_t* token;
+
+ token = emalloc(sizeof(struct TOKEN_STRUC));
token->type = type;
- token->value = val;
+ token->val = val;
+ token->nxt = NULL;
return token;
}
-char* token_get_type(int type) {
- switch (type) {
- case TOKEN_KEYWORD:
- return "TOKEN_KEYWORD";
- break;
- case TOKEN_PRIM_STR:
- return "TOKEN_PRIM_STR";
- break;
- case TOKEN_PRIM_INT:
- return "TOKEN_PRIM_INT";
- break;
- case TOKEN_COMM:
- return "TOKEN_COMM";
- break;
- case TOKEN_STMNT_END:
- return "TOKEN_EXPR_END";
- break;
- case TOKEN_LGROUP:
- return "TOKEN_LGROUP";
- break;
- case TOKEN_RGROUP:
- return "TOKEN_RGROUP";
- break;
- case TOKEN_DIRECTIVE:
- return "TOKEN_DIRECTIVE";
- break;
- case TOKEN_FN_APPLY:
- return "TOKEN_FN_APPLY";
- break;
- case TOKEN_LIST_DELIM:
- return "TOKEN_LIST_DELIM";
- break;
- case TOKEN_DEF_TAG:
- return "TOKEN_DEF_TAG";
- break;
- case TOKEN_BLOCK_START:
- return "TOKEN_BLOCK_START";
- break;
- case TOKEN_BLOCK_END:
- return "TOKEN_BLOCK_END";
- break;
- case TOKEN_NAMESPACE_DELIM:
- return "TOKEN_NAMESPACE_DELIM";
- break;
- case TOKEN_ARRAY_START:
- return "TOKEN_ARRAY_START";
- break;
- case TOKEN_ARRAY_END:
- return "TOKEN_ARRAY_END";
- break;
- case TOKEN_DEF_SET:
- return "TOKEN_DEF_SET";
- break;
- case TOKEN_UNKNOWN:
- return "TOKEN_UNKNOWN";
- break;
- case TOKEN_EOF:
- return "TOKEN_EOF";
- break;
- default:
- return "???";
- }
-}
-
-int char_could_start_keyword(char* character) {
- for (int i = 0; i < TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN; ++ i) {
- if (TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS[i] == *character) {
- return 1;
- }
- }
-
- return 0;
-}
-
-int char_could_split_keyword(char* character) {
- if (char_could_start_keyword(character)) {
- return 1;
- } else {
- for (int i = 0; i < TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS_LEN; ++ i) {
- if (TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS[i] == *character) {
- return 1;
- }
- }
-
- return 0;
- }
+void token_destroy(token_t* token) {
+ free(token);
}
-int char_could_start_int(char* character) {
- for (int i = 0; i < 10; ++ i) {
- if (TOKEN_CHAR_FIRST_CHAR_INT[i] == *character) {
- return 1;
- }
- }
+token_t* token_last(token_t* token) {
+ token_t* t;
- return 0;
-}
-
-int char_can_ignore(char* character) {
- for (int i = 0; i < TOKEN_CHAR_IGNORE_LEN; ++ i) {
- if (TOKEN_CHAR_IGNORE[i] == *character) {
- return 1;
- }
+ while (t->nxt) {
+ t = t->nxt;
}
- return 0;
-}
-
-
-int token_char_quote(char c) { return (c != '\''); }
-int token_char_grave(char c) { return (c != '`'); }
-int token_char_pound(char c) { return (c != '#'); }
-int token_char_colon(char c) { return (c != ':'); }
-int token_char_kywrd(char c) { return (char_could_split_keyword(&c)); }
-
-void token_destroy(token_t* token) {
- free(token->value);
- free(token);
+ return t;
}
diff --git a/src/util.c b/src/util.c
index cee2beb..a175f6b 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1,5 +1,6 @@
#include "include/util.h"
+
void die(const char* fmt, ...) {
va_list ap;