From 6fc8f91e0d96ae4b4ee59ea562574cc04fdf8abf Mon Sep 17 00:00:00 2001 From: c+1 Date: Sat, 21 Oct 2023 09:10:58 -0400 Subject: ⬣ --- Makefile | 1 + README.md | 35 ++++++++++++++++++------------ res/HALK.png | Bin 0 -> 822 bytes src/include/lexer.h | 11 ++++++++-- src/include/token.h | 7 +++++- src/include/tree.h | 46 ++++++++++++---------------------------- src/lexer.c | 60 +++++++++++++++++++++++++++++++--------------------- src/token.c | 25 ++++++++++++++++++++-- tree.txt | 30 -------------------------- 9 files changed, 110 insertions(+), 105 deletions(-) create mode 100644 res/HALK.png delete mode 100644 tree.txt diff --git a/Makefile b/Makefile index d9602c2..b888564 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ DBG_CFLAGS := -Og -ggdb -pedantic -Wall -Wno-deprecated-declarations -fsanitize= CFLAGS := ${REG_CFLAGS} SRCS := $(wildcard src/*.c) SRCS := $(filter-out src/parser.c, $(SRCS)) +SRCS := $(filter-out src/tree.c, $(SRCS)) OBJS := $(SRCS:.c=.o) all: halk diff --git a/README.md b/README.md index 01ad2cb..5a42d7b 100644 --- a/README.md +++ b/README.md @@ -20,24 +20,31 @@ One must simply $ halk examples/simple.halk ``` -. Running *HALK* with no arguments allows one to send arbitrary text. +. Running *HALK* with no arguments allows one to lex(?) arbitrary text through stdin. An example session is displayed below: ```text $ halk -> :str:var = 'Hello, World.'; -> ^D +:str:var = "Hello, World"; +[==] HLKT: test passed: src/main.c/main/18 +[==] source gotten +[==] source: :str:var = "Hello, World"; + +[==] HLKT: test passed: src/main.c/main/24 +[==] preprocessor created +[==] pre-processed source: :str:var="Hello, World"; +[==] HLKT: test passed: src/main.c/main/34 +[==] preprocessor ran +[==] HLKT: test passed: src/main.c/main/39 +[==] HLKT: test passed: src/main.c/main/40 [==] lexer created -[==] BEGIN INPUT -:str:var = 'Hello, World.'; -[==] END INPUT -[==] token type: [TOKEN_DEF_TAG] token value: [str] -[==] token type: [TOKEN_DEF_TAG] token value: [var] -[==] token type: [TOKEN_DEF_SET] token value: [=] -[==] token type: [TOKEN_PRIM_STR] token value: [Hello, World.] -[==] token type: [TOKEN_EXPR_END] token value: [;] -[==] token type: [TOKEN_EOF] token value: [] -[==] source file closed +[==] token/t=9 /v=str +[==] token/t=9 /v=var +[==] token/t=4 /v== +[==] token/t=2 /v=Hello, World +[==] token/t=3 /v=; +[==] lexer ran +[==] HLKT: all 5 tests passed ``` # Syntax @@ -49,7 +56,7 @@ Note that all syntax described is liable to sudden and violent change. Example programs can be found [here](../tree/examples). - [x] Preprocessor -- [ ] Lexer +- [x] Lexer - [ ] Abstract Syntax Tree - [ ] Parser - [ ] Doer diff --git a/res/HALK.png b/res/HALK.png new file mode 100644 index 0000000..a41b12e Binary files /dev/null and b/res/HALK.png differ diff --git a/src/include/lexer.h b/src/include/lexer.h index b2bf9eb..83ace59 100644 --- a/src/include/lexer.h +++ b/src/include/lexer.h @@ -16,8 +16,10 @@ typedef struct LEXER_STRUC { enum LEXER_STATE { /* normal 1-character token */ LEXER_STATE_REG, - /* character */ - LEXER_STATE_CHR, + /* definition tag */ + LEXER_STATE_TAG, + /* escaped character in string */ + LEXER_STATE_ESC, /* string */ LEXER_STATE_STR, /* definition */ @@ -46,8 +48,13 @@ void lexer_add_token(lexer_t* lexer, token_t* token); lexer_do_reg() */ void lexer_add_current_char(lexer_t* lexer, int type); +/* add first character of lexer's src to the value of the last token in tokenl, if it exists. otherwise, create new token and add it */ +void lexer_add_current_char_to_last_token(lexer_t* lexer, int type); + /* handle regular state */ void lexer_do_reg(lexer_t*); +/* handle definition tag state*/ +void lexer_do_tag(lexer_t*); /* handle character state */ void lexer_do_chr(lexer_t*); /* handle string state */ diff --git a/src/include/token.h b/src/include/token.h index 5a3a36c..a186fa9 100644 --- a/src/include/token.h +++ b/src/include/token.h @@ -34,11 +34,16 @@ typedef struct TOKEN_STRUC { } token_t; /* creates a token */ -token_t* token_init(int type, char* val); +token_t* token_init(int type, char val); /* destroys a token **and all tokens contained in nxt** **Make sure to set the nxt of any parent tokens to NULL** */ void token_destroy(token_t* token); /* return pointer to the last token */ token_t* token_last(token_t* token); +/* add a character to the token value */ +void token_add_char(token_t*, char); + +/* print a token -- for debugging purposes */ +void token_print(token_t* token); #endif diff --git a/src/include/tree.h b/src/include/tree.h index a2b71da..88287a4 100644 --- a/src/include/tree.h +++ b/src/include/tree.h @@ -4,44 +4,26 @@ #include typedef struct TREE_STRUC { - enum { - TREE_COMP, - TREE_DEF, - TREE_CALL, - TREE_TYPE_STR, + enum TREE_TYPE { TREE_TYPE_INT, + TREE_TYPE_STR, + TREE_TYPE_DEF, + TREE_TYPE_CAL, + TREE_TYPE_COND, } type; union { - struct { // === "COMPOUND" === - struct TREE_STRUC** value; - size_t size; - } comp; - - struct { // === DEFINITIONS === - char* type; // the definition type - char** tags; // the definition tags - size_t tags_size; // the number of tags - char* name; // the definition name - struct TREE_STRUC** args; // the arguments the definition will accept - size_t args_size; // the number of arguments - struct TREE_STRUC* value; // value of definition - } def; - - struct { // === CALLS === - char* target; // name of definition being called - struct TREE_STRUC** args; // arguments passed to definition - size_t args_size; // the number of arguments - } call; + struct { + int val; + } tree_int_t; - // === TYPES === - struct { // strings - char* value; - } type_str; + struct { + char* val; + } tree_str_t; - struct { // integers - int value; - } type_int; + struct { + char* id; + } tree_def_t; } data; } tree_t; diff --git a/src/lexer.c b/src/lexer.c index ba0e8e1..7f36b98 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -36,31 +36,33 @@ void lexer_add_token(lexer_t* lexer, token_t* token) { lexer->tokenl_last = token; } - log_inf("token/v:%s\t/t:%d", token->val, token->type); lexer->tokenc ++; } void lexer_add_current_char(lexer_t* lexer, int type) { - char* c; /* get the current character as a string */ token_t* t; /* the token to be added */ - c = ecalloc(2, sizeof(char)); - c[0] = *lexer->src; - c[1] = '\0'; - - t = token_init(type, c); + t = token_init(type, *lexer->src); lexer_add_token(lexer, t); } +void lexer_add_current_char_to_last_token(lexer_t* lexer, int type) { + if (lexer->tokenl_last && lexer->tokenl_last->type == type) { + token_add_char(lexer->tokenl_last, *lexer->src); + } else { + lexer_add_current_char(lexer, type); + } +} + void lexer_do_reg(lexer_t* lexer) { switch (*lexer->src) { case SYNTAX_APPLY: lexer_add_current_char(lexer, TOKEN_APPLY); break; case SYNTAX_TAG_DELIM: - lexer_add_current_char(lexer, TOKEN_TAG_DELIM); + lexer->state = LEXER_STATE_TAG; break; case SYNTAX_NAMESPACE_DELIM: lexer_add_current_char(lexer, TOKEN_NAMESPACE_DELIM); @@ -83,39 +85,49 @@ void lexer_do_reg(lexer_t* lexer) { case SYNTAX_EXPR_END: lexer_add_current_char(lexer, TOKEN_EXPR_END); break; - case SYNTAX_STR_DELIM: - lexer_add_current_char(lexer, TOKEN_STR_DELIM); - break; - case SYNTAX_CHAR_DELIM: - lexer_add_current_char(lexer, TOKEN_CHAR_DELIM); - break; case SYNTAX_LIST_DELIM: lexer_add_current_char(lexer, TOKEN_LIST_DELIM); break; + case SYNTAX_STR_DELIM: + lexer->state = LEXER_STATE_STR; + break; default: lexer_add_current_char(lexer, TOKEN_UNKNOWN); } } -void lexer_do_chr(lexer_t* lexer) { - if (*lexer->src == '\'') { - lexer->state = LEXER_STATE_REG; - } else { - token_t* t; - - t = token_init(TOKEN_CHAR, *lexer->src); - - lexer_add_token(lexer, t); +void lexer_do_tag(lexer_t* lexer) { + switch (*lexer->src) { + case SYNTAX_SET: + lexer_add_current_char(lexer, TOKEN_SET); + lexer->state = LEXER_STATE_REG; + break; + case SYNTAX_APPLY: + lexer_add_current_char(lexer, TOKEN_APPLY); + lexer->state = LEXER_STATE_REG; + break; + case SYNTAX_TAG_DELIM: + lexer_add_token(lexer, token_init(TOKEN_TAG, '\0')); + break; + default: lexer_add_current_char_to_last_token(lexer, TOKEN_TAG); } } void lexer_do_str(lexer_t* lexer) { - + if (*lexer->src == SYNTAX_STR_DELIM) { + lexer->state = LEXER_STATE_REG; + } else { + lexer_add_current_char_to_last_token(lexer, TOKEN_STR); + } } void lexer_run(lexer_t* lexer) { while (*lexer->src) { if (lexer->state == LEXER_STATE_REG) { lexer_do_reg(lexer); } + else if (lexer->state == LEXER_STATE_TAG) { lexer_do_tag(lexer); } + else if (lexer->state == LEXER_STATE_STR) { lexer_do_str(lexer); } lexer->src ++; } + + token_print(lexer->tokenl); } diff --git a/src/token.c b/src/token.c index ece32f4..935f23e 100644 --- a/src/token.c +++ b/src/token.c @@ -2,12 +2,14 @@ #include "include/token.h" -token_t* token_init(int type, char* val) { +token_t* token_init(int type, char val) { token_t* token; token = emalloc(sizeof(struct TOKEN_STRUC)); token->type = type; - token->val = val; + token->val = emalloc(2); + *token->val = val; + token->val[1] = '\0'; token->nxt = NULL; return token; @@ -32,3 +34,22 @@ token_t* token_last(token_t* token) { return t; } + +void token_add_char(token_t* token, char c) { + size_t orig; + + orig = strlen(token->val); + + token->val = erealloc(token->val, orig + sizeof c + 1); + token->val[orig] = c; + token->val[orig + 1] = '\0'; +} + +void token_print(token_t* token) { + + log_dbg("token/t=%d\t/v=%s", token->type, token->val); + + if (token->nxt) { + token_print(token->nxt); + } +} diff --git a/tree.txt b/tree.txt deleted file mode 100644 index 39340fe..0000000 --- a/tree.txt +++ /dev/null @@ -1,30 +0,0 @@ -The Expr -======== - [ block ] ⇐ A list of exprs. - │ ┌┘ - │ │ - [ expr ] ── [ lit ] ⇐ A literal value; "base case" for the tree. - │ │ ├── type - ┌──┘ └──┐ └─ value -[ def ] [ call ] - │ │ - ├─ [target] ← id ├─ [target] ← id ⇐ An id is a pointer to another part of the tree. - └── [value] ← expr │ It also contains the flags used in the definition. - └──── [arg] ← expr - -Example Expr Tree -================= -[ block ] - │ - ├─ [ def ] - │ │ - │ ├─ [target] → hello - │ └── [value] → [ lit ] - │ ├── type → str - │ └─ value → Hello, World - ├─ [ call ] - │ │ - │ ├─ [target] → print - │ └──── [arg] → hello - ... - -- cgit v1.2.3