author     c+1  2023-05-18 17:57:56 -0400
committer  c+1  2023-05-18 17:57:56 -0400
commit     f6ae20caf8191f78eea90edfb17f316db8f8c6b5 (patch)
tree       076bec3d50cb23b11f1240d0926f5f8d42b57cc4
parent     9c78e75eb59da9c7fb8fbdd5b536ab1b22727859 (diff)
finished the lexer, for real this time
-rw-r--r--  src/include/lexer.h   13
-rw-r--r--  src/include/token.h    5
-rw-r--r--  src/lexer.c          190
3 files changed, 106 insertions, 102 deletions
diff --git a/src/include/lexer.h b/src/include/lexer.h
index 6d9dff8..7364910 100644
--- a/src/include/lexer.h
+++ b/src/include/lexer.h
@@ -5,6 +5,9 @@
#include "token.h"
+#define LEXER_VALID (lexer->c != '\0' && lexer->i < strlen(lexer->content)) // expects a local lexer_t* named lexer in scope
+
+
typedef struct LEXER_STRUC {
char c; // current character in content
unsigned int i; // index of c
@@ -19,8 +22,7 @@ extern void lexer_destroy (lexer_t* lexer);
// advance the lexer
extern void lexer_next (lexer_t* lexer);
extern void lexer_pass (lexer_t* lexer);
-extern int lexer_is_valid (lexer_t* lexer);
-extern token_t* lexer_get_next_token (lexer_t* lexer);
+extern token_t* lexer_get_next_token (lexer_t* lexer); // chars -> tokens
extern token_t* lexer_next_token (lexer_t* lexer, token_t* token);
extern char* lexer_get_c_as_string (lexer_t* lexer);
@@ -29,12 +31,13 @@ extern token_t* lexer_get_array (lexer_t* lexer);
extern token_t* lexer_get_string (lexer_t* lexer);
extern token_t* lexer_get_comment (lexer_t* lexer);
-// def collectors
+// special def collectors
extern token_t* lexer_get_directive (lexer_t* lexer);
extern token_t* lexer_get_keyword (lexer_t* lexer);
-extern token_t* lexer_get_def (lexer_t* lexer);
+extern token_t* lexer_get_def_var (lexer_t* lexer);
// <mutability:?><type:><name>=<value?>
+/*
extern token_t* lexer_get_def (lexer_t* lexer); // get the definition
extern token_t* lexer_get_def_mutability (lexer_t* lexer); // look for mut: or immut: (optional, default immut)
extern token_t* lexer_get_def_type (lexer_t* lexer); // get the def type (required)
@@ -49,6 +52,6 @@ extern token_t* lexer_get_var_value (lexer_t* lexer); // get the var
extern token_t* lexer_get_fn_def (lexer_t* lexer);
extern token_t* lexer_get_fn_args (lexer_t* lexer); // get the function args (optional, default empty)
extern token_t* lexer_get_fn_body (lexer_t* lexer); // get the function body (required)
-
+*/
#endif
diff --git a/src/include/token.h b/src/include/token.h
index 45154f4..f7a166f 100644
--- a/src/include/token.h
+++ b/src/include/token.h
@@ -16,7 +16,8 @@ typedef struct TOKEN_STRUC {
TOKEN_DIRECTIVE, // #DIRECTIVE;
TOKEN_FN_APPLY, // .
TOKEN_LIST_DELIM, // ,
- TOKEN_DEF_ARGS_DELIM, // :
+ TOKEN_DEF_TAGS_DELIM, // :
+ TOKEN_DEF, // def:def (the tags before '=', as collected by lexer_get_def)
TOKEN_BLOCK_DELIM_START, // {
TOKEN_BLOCK_DELIM_END, // }
TOKEN_NAMESPACE_DELIM, // /
@@ -28,6 +29,8 @@ typedef struct TOKEN_STRUC {
char* value;
} token_t;
+token_t* token_init(int type, char* val);
+
char TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS[] = "abcdefghijklmnopqrstuvwxyz_"; // chars that can begin a var name
int TOKEN_DEFNAME_FIRST_CHAR_ALLOWED_CHARS_LEN = 27; // maximum efficiency!
char TOKEN_DEFNAME_SPLIT_CHAR_ALLOWED_CHARS[] = "1234567890_-"; // chars that can be in the rest of the var name,
diff --git a/src/lexer.c b/src/lexer.c
index f51c1fa..484766f 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -9,11 +9,6 @@
#include "include/token.h"
-// TODO:
-// lexer_valid(lexer) -> bool
-// whether at EOF, already defined just make it cleaner
-
-
lexer_t* lexer_init(char* content) {
lexer_t* lexer = calloc(1, sizeof(struct LEXER_STRUC));
@@ -24,8 +19,13 @@ lexer_t* lexer_init(char* content) {
return lexer;
}
+void lexer_destroy(lexer_t* lexer) {
+ free(lexer->content);
+ free(lexer);
+}
+
void lexer_next(lexer_t* lexer) {
- if (lexer->c != '\0' && lexer->i < strlen(lexer->content)) {
+ if (LEXER_VALID) {
lexer->i += 1;
lexer->c = lexer->content[lexer->i];
}
@@ -38,34 +38,36 @@ void lexer_pass(lexer_t* lexer) {
}
token_t* lexer_get_next_token(lexer_t* lexer) {
- while (lexer->c != '\0' && lexer->i < strlen(lexer->content)) {
+ while (LEXER_VALID) {
+
+ if (char_can_ignore(&lexer->c)) {
+ lexer_pass(lexer);
+ }
- if (char_can_ignore(&lexer->c)) { lexer_pass(lexer); }
- if (char_could_start_keyword(&lexer->c)) { return lexer_get_keyword(lexer); }
+ if (char_could_start_keyword(&lexer->c)) {
+ return lexer_get_keyword(lexer);
+ }
switch (lexer->c) {
- case '\'':
- return lexer_get_string(lexer); break;
- case '[':
- return lexer_get_array(lexer); break;
- case ']': return lexer_next_token(
- lexer,
- token_init(
- TOKEN_ARRAY_DELIM_END,
- lexer_get_c_as_string(lexer)
- )
- ); break;
+ case '\'':
+ return lexer_get_string(lexer);
+ break;
+ case '`':
+ return lexer_get_comment(lexer);
+ break;
+ case ';':
+ return lexer_next_token(
+ lexer,
+ token_init(
+ TOKEN_EXPR_END,
+ lexer_get_c_as_string(lexer)
+ )
+ );
+ break;
case '=':
return lexer_get_def_const(lexer); break;
case '-':
return lexer_get_def_mut(lexer); break;
- case ';': return lexer_next_token(
- lexer,
- token_init(
- TOKEN_EOF,
- lexer_get_c_as_string(lexer)
- )
- ); break;
case '(': return lexer_next_token(
lexer,
token_init(
@@ -127,9 +129,44 @@ token_t* lexer_get_next_token(lexer_t* lexer) {
return NULL;
}
-token_t* lexer_get_string(lexer_t* lexer) {
+token_t* lexer_next_token(lexer_t* lexer, token_t* token) {
lexer_next(lexer);
+
+ return token;
+}
+
+char* lexer_get_c_as_string(lexer_t* lexer) {
+ char* str = calloc(2, 1 * sizeof(char));
+ str[0] = lexer->c;
+ str[1] = '\0';
+ return str;
+}
+
+// TODO: abstract away this kind of thing
+token_t* lexer_get_array(lexer_t* lexer) {
+ lexer_next(lexer); // skip opening [
+ char* array_so_far = calloc(1, sizeof(char));
+ array_so_far[0] = '\0';
+
+ while (LEXER_VALID && lexer->c != ']') { // stop at EOF too, so an unterminated array cannot spin forever
+ char* current = lexer_get_c_as_string(lexer);
+ array_so_far = realloc(
+ array_so_far,
+ (strlen(array_so_far) + strlen(current) + 1) * sizeof(char) // +1 for the '\0' that strcat writes
+ );
+
+ strcat(array_so_far, current);
+ lexer_next(lexer);
+ }
+
+ lexer_next(lexer); // skip over closing ]
+
+ return token_init(TOKEN_STR, array_so_far); // return the collected array
+}
+
+token_t* lexer_get_string(lexer_t* lexer) {
+ lexer_next(lexer);
char* str_so_far = calloc(1, sizeof(char));
str_so_far[0] = '\0';
@@ -141,13 +178,12 @@ token_t* lexer_get_string(lexer_t* lexer) {
);
strcat(str_so_far, current);
-
lexer_next(lexer);
}
- lexer_next(lexer); // skip over closing "
+ lexer_next(lexer); // skip over closing '
- return token_init(TOKEN_STR, str_so_far);
+ return token_init(TOKEN_STR, str_so_far); // return the collected string
}
token_t* lexer_get_comment(lexer_t* lexer) {
@@ -155,7 +191,7 @@ token_t* lexer_get_comment(lexer_t* lexer) {
char* comment_so_far = calloc(1, sizeof(char));
- while (lexer->c != ']') {
+ while (LEXER_VALID && lexer->c != '`') {
char* current = lexer_get_c_as_string(lexer);
comment_so_far = realloc(
comment_so_far,
@@ -166,62 +202,28 @@ token_t* lexer_get_comment(lexer_t* lexer) {
lexer_next(lexer);
}
- lexer_next(lexer); // skip over closing ]
+ lexer_next(lexer); // skip over closing `
return token_init(TOKEN_COMM, comment_so_far);
}
-token_t* lexer_get_def_const(lexer_t* lexer) {
- lexer_pass(lexer);
-
- if (lexer_next(lexer), lexer->c == '>') {
- lexer_next(lexer);
- return token_init(TOKEN_DEFINE_CONST, "=>");
- } else {
- log_err("Unknown variable state.");
- exit(1);
- }
-}
-
-token_t* lexer_get_def_var(lexer_t* lexer) {
- lexer_pass(lexer);
-
- if (lexer_next(lexer), lexer->c == '=') {
- return lexer_get_def_const(lexer);
- } else if (lexer_next(lexer), lexer->c == '-') {
- return lexer_get_def_mut(lexer);
- } else {
- log_err("Unknown variable state.");
- exit(1);
- }
-}
-
-token_t* lexer_get_def_mut(lexer_t* lexer) {
- lexer_pass(lexer);
-
- if (lexer_next(lexer), lexer->c == '>') {
- lexer_next(lexer);
- return token_init(TOKEN_DEFINE_MUT, "->");
- } else {
- log_err("Unknown variable state.");
- exit(1);
- }
-}
-
token_t* lexer_get_directive(lexer_t* lexer) {
lexer_next(lexer);
-
char* directive_so_far = calloc(1, sizeof(char));
directive_so_far[0] = '\0';
while (lexer->c != ';') {
char* current = lexer_get_c_as_string(lexer);
- directive_so_far = realloc(directive_so_far, (strlen(directive_so_far) + strlen(current) * sizeof(char)));
+ directive_so_far = realloc(
+ directive_so_far,
+ (strlen(directive_so_far) + strlen(current) + 1) * sizeof(char) // +1 for the '\0'
+ );
+
strcat(directive_so_far, current);
lexer_next(lexer);
}
- lexer_next(lexer);
+ lexer_next(lexer); // skip over closing ;
return token_init(TOKEN_DIRECTIVE, directive_so_far);
}
@@ -229,37 +231,33 @@ token_t* lexer_get_directive(lexer_t* lexer) {
token_t* lexer_get_keyword(lexer_t* lexer) {
char* keyword_so_far = calloc(1, sizeof(char));
keyword_so_far[0] = '\0';
-
- while (isalnum(lexer->c)) {
+ while (char_could_split_keyword(&lexer->c)) {
char* current = lexer_get_c_as_string(lexer);
- keyword_so_far = realloc(keyword_so_far, (strlen(keyword_so_far) + strlen(current) * sizeof(char)));
+ keyword_so_far = realloc(
+ keyword_so_far,
+ (strlen(keyword_so_far) + strlen(current) + 1) * sizeof(char) // +1 for the '\0'
+ );
+
strcat(keyword_so_far, current);
lexer_next(lexer);
}
- lexer_next(lexer);
-
return token_init(TOKEN_KEYWORD, keyword_so_far);
}
-token_t* lexer_next_token(lexer_t* lexer, token_t* token) {
- lexer_next(lexer);
-
- return token;
-}
-
-char* lexer_get_c_as_string(lexer_t* lexer) {
- char* str = calloc(2, 1 * sizeof(char));
- str[0] = lexer->c;
- str[1] = '\0';
-
-
- return str;
-}
-
-void lexer_destroy(lexer_t* lexer) {
- free(lexer->content);
- free(lexer);
+token_t* lexer_get_def(lexer_t* lexer) {
+ char* def_so_far = calloc(1, sizeof(char));
+ def_so_far[0] = '\0';
+ while (LEXER_VALID && lexer->c != '=') {
+ char* current = lexer_get_c_as_string(lexer);
+ def_so_far = realloc(
+ def_so_far,
+ (strlen(def_so_far) + strlen(current) + 1) * sizeof(char) // +1 for the '\0'
+ );
+ strcat(def_so_far, current);
+ lexer_next(lexer);
+ }
+ return token_init(TOKEN_DEF, def_so_far);
}
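
Below is a minimal sketch of how this lexer might be driven once the commit above is applied, based only on the declarations in the diff (lexer_init, lexer_get_next_token, lexer_destroy, and token_t's type/value fields). The include path, the sample input, and the by-hand token cleanup are assumptions: no main.c or token_destroy appears in this commit, and the input string just exercises the cases the switch above handles (backtick comments, single-quoted strings, ';' terminators).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "include/lexer.h" // also pulls in token.h; path assumed relative to src/

int main(void) {
    // lexer_destroy() frees lexer->content, so hand the lexer a heap copy, not a string literal
    char* src = strdup("`just a comment` 'hello world';");

    lexer_t* lexer = lexer_init(src);
    token_t* tok;

    // lexer_get_next_token() returns NULL once the input is exhausted
    while ((tok = lexer_get_next_token(lexer)) != NULL) {
        printf("type=%d value=%s\n", tok->type, tok->value);
        free(tok->value); // assumes token_init() keeps the heap string it is handed
        free(tok);        // no token_destroy() in this diff, so free by hand
    }

    lexer_destroy(lexer); // also frees src
    return 0;
}

Expected output would be one line per token (a TOKEN_COMM, a TOKEN_STR, then a TOKEN_EXPR_END), printed as raw enum values since the diff defines no name table.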