From 6fc8f91e0d96ae4b4ee59ea562574cc04fdf8abf Mon Sep 17 00:00:00 2001
From: c+1
Date: Sat, 21 Oct 2023 09:10:58 -0400
Subject: ⬣

---
 Makefile            |   1 +
 README.md           |  35 ++++++++++++++++++------------
 res/HALK.png        | Bin 0 -> 822 bytes
 src/include/lexer.h |  11 ++++++++--
 src/include/token.h |   7 +++++-
 src/include/tree.h  |  46 ++++++++++++----------------------------
 src/lexer.c         |  60 +++++++++++++++++++++++++++++++---------------------
 src/token.c         |  25 ++++++++++++++++++++--
 tree.txt            |  30 --------------------------
 9 files changed, 110 insertions(+), 105 deletions(-)
 create mode 100644 res/HALK.png
 delete mode 100644 tree.txt

diff --git a/Makefile b/Makefile
index d9602c2..b888564 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ DBG_CFLAGS := -Og -ggdb -pedantic -Wall -Wno-deprecated-declarations -fsanitize=
 CFLAGS     := ${REG_CFLAGS}
 SRCS       := $(wildcard src/*.c)
 SRCS       := $(filter-out src/parser.c, $(SRCS))
+SRCS       := $(filter-out src/tree.c, $(SRCS))
 OBJS       := $(SRCS:.c=.o)
 
 all: halk
diff --git a/README.md b/README.md
index 01ad2cb..5a42d7b 100644
--- a/README.md
+++ b/README.md
@@ -20,24 +20,31 @@ One must simply
 $ halk examples/simple.halk
 ```
 
-. Running *HALK* with no arguments allows one to send arbitrary text.
+. Running *HALK* with no arguments allows one to lex arbitrary text read from stdin.
 An example session is displayed below:
 
 ```text
 $ halk
-> :str:var = 'Hello, World.';
-> ^D
+:str:var = "Hello, World";
+[==] HLKT: test passed: src/main.c/main/18
+[==] source gotten
+[==] source: :str:var = "Hello, World";
+
+[==] HLKT: test passed: src/main.c/main/24
+[==] preprocessor created
+[==] pre-processed source: :str:var="Hello, World";
+[==] HLKT: test passed: src/main.c/main/34
+[==] preprocessor ran
+[==] HLKT: test passed: src/main.c/main/39
+[==] HLKT: test passed: src/main.c/main/40
 [==] lexer created
-[==] BEGIN INPUT
-:str:var = 'Hello, World.';
-[==] END INPUT
-[==] token type: [TOKEN_DEF_TAG]    token value: [str]
-[==] token type: [TOKEN_DEF_TAG]    token value: [var]
-[==] token type: [TOKEN_DEF_SET]    token value: [=]
-[==] token type: [TOKEN_PRIM_STR]      token value: [Hello, World.]
-[==] token type: [TOKEN_EXPR_END]      token value: [;]
-[==] token type: [TOKEN_EOF]     token value: []
-[==] source file closed
+[==] token/t=9	/v=str
+[==] token/t=9	/v=var
+[==] token/t=4	/v==
+[==] token/t=2	/v=Hello, World
+[==] token/t=3	/v=;
+[==] lexer ran
+[==] HLKT: all 5 tests passed
 ```
 
 # Syntax
@@ -49,7 +56,7 @@ Note that all syntax described is liable to sudden and violent change.
 Example programs can be found [here](../tree/examples).
 
 - [x] Preprocessor
-- [ ] Lexer
+- [x] Lexer
 - [ ] Abstract Syntax Tree
 - [ ] Parser
 - [ ] Doer
diff --git a/res/HALK.png b/res/HALK.png
new file mode 100644
index 0000000..a41b12e
Binary files /dev/null and b/res/HALK.png differ
diff --git a/src/include/lexer.h b/src/include/lexer.h
index b2bf9eb..83ace59 100644
--- a/src/include/lexer.h
+++ b/src/include/lexer.h
@@ -16,8 +16,10 @@ typedef struct LEXER_STRUC {
    enum LEXER_STATE {
       /* normal 1-character token */
       LEXER_STATE_REG,
-      /* character */
-      LEXER_STATE_CHR,
+      /* definition tag */
+      LEXER_STATE_TAG,
+      /* escaped character in string */
+      LEXER_STATE_ESC,
       /* string */
       LEXER_STATE_STR,
       /* definition */
@@ -46,8 +48,13 @@ void lexer_add_token(lexer_t* lexer, token_t* token);
    lexer_do_reg() */
 void lexer_add_current_char(lexer_t* lexer, int type);
 
+/* append the first character of lexer's src to the value of the last token in tokenl, if that token exists and has the given type. otherwise, create a new token of that type and add it */
+void lexer_add_current_char_to_last_token(lexer_t* lexer, int type);
+
 /* handle regular state */
 void lexer_do_reg(lexer_t*);
+/* handle definition tag state */
+void lexer_do_tag(lexer_t*);
 /* handle character state */
 void lexer_do_chr(lexer_t*);
 /* handle string state */
diff --git a/src/include/token.h b/src/include/token.h
index 5a3a36c..a186fa9 100644
--- a/src/include/token.h
+++ b/src/include/token.h
@@ -34,11 +34,16 @@ typedef struct TOKEN_STRUC {
 } token_t;
 
 /* creates a token */
-token_t* token_init(int type, char* val);
+token_t* token_init(int type, char val);
 /* destroys a token **and all tokens contained in nxt** **Make sure to set the nxt of any parent tokens to NULL** */
 void token_destroy(token_t* token);
 
 /* return pointer to the last token */
 token_t* token_last(token_t* token);
+/* add a character to the token value */
+void token_add_char(token_t*, char);
+
+/* print a token and every token that follows it through nxt -- for debugging purposes */
+void token_print(token_t* token);
 
 #endif
diff --git a/src/include/tree.h b/src/include/tree.h
index a2b71da..88287a4 100644
--- a/src/include/tree.h
+++ b/src/include/tree.h
@@ -4,44 +4,26 @@
 #include <stdlib.h>
 
 typedef struct TREE_STRUC {
-   enum {
-      TREE_COMP,
-      TREE_DEF,
-      TREE_CALL,
-      TREE_TYPE_STR,
+   enum TREE_TYPE {
       TREE_TYPE_INT,
+      TREE_TYPE_STR,
+      TREE_TYPE_DEF,
+      TREE_TYPE_CAL,
+      TREE_TYPE_COND,
    } type;
 
    union {
-      struct {                               // === "COMPOUND" ===
-         struct TREE_STRUC**  value;
-         size_t               size;
-      } comp;
-
-      struct {                               // === DEFINITIONS ===
-         char*                type;          // the definition type
-         char**               tags;          // the definition tags
-         size_t               tags_size;     // the number of tags
-         char*                name;          // the definition name
-         struct TREE_STRUC**  args;          // the arguments the definition will accept
-         size_t               args_size;     // the number of arguments
-         struct TREE_STRUC*   value;         // value of definition
-      } def;
-
-      struct {                               // === CALLS ===
-         char*                target;        // name of definition being called
-         struct TREE_STRUC**  args;          // arguments passed to definition
-         size_t               args_size;     // the number of arguments
-      } call;
+      struct {
+         int val;
+      } tree_int_t;
 
-                                             // === TYPES ===
-      struct {                               // strings
-         char*                value;
-      } type_str; 
+      struct {
+         char* val;
+      } tree_str_t;
 
-      struct {                               // integers
-         int                  value;
-      } type_int;
+      struct {
+         char* id;
+      } tree_def_t;
    } data;
 } tree_t;
 
diff --git a/src/lexer.c b/src/lexer.c
index ba0e8e1..7f36b98 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -36,31 +36,33 @@ void lexer_add_token(lexer_t* lexer, token_t* token) {
       lexer->tokenl_last = token;
    }
 
-   log_inf("token/v:%s\t/t:%d", token->val, token->type);
 
    lexer->tokenc ++;
 }
 
 void lexer_add_current_char(lexer_t* lexer, int type) {
-   char* c;    /* get the current character as a string */
    token_t* t; /* the token to be added */
 
-   c = ecalloc(2, sizeof(char));
-   c[0] = *lexer->src;
-   c[1] = '\0';
-
-   t = token_init(type, c);
+   t = token_init(type, *lexer->src);
 
    lexer_add_token(lexer, t);
 }
 
+void lexer_add_current_char_to_last_token(lexer_t* lexer, int type) {
+   token_t* last = lexer->tokenl_last; /* the lexer's tokenl_last, possibly NULL */
+
+   /* extend the last token only when it exists and has the requested type */
+   if (last && last->type == type) { token_add_char(last, *lexer->src); }
+   else { lexer_add_current_char(lexer, type); }
+}
+
 void lexer_do_reg(lexer_t* lexer) {
    switch (*lexer->src) {
       case SYNTAX_APPLY:
          lexer_add_current_char(lexer, TOKEN_APPLY);
          break;
       case SYNTAX_TAG_DELIM:
-         lexer_add_current_char(lexer, TOKEN_TAG_DELIM);
+         lexer->state = LEXER_STATE_TAG;
          break;
       case SYNTAX_NAMESPACE_DELIM:
          lexer_add_current_char(lexer, TOKEN_NAMESPACE_DELIM);
@@ -83,39 +85,49 @@ void lexer_do_reg(lexer_t* lexer) {
       case SYNTAX_EXPR_END:
          lexer_add_current_char(lexer, TOKEN_EXPR_END);
          break;
-      case SYNTAX_STR_DELIM:
-         lexer_add_current_char(lexer, TOKEN_STR_DELIM);
-         break;
-      case SYNTAX_CHAR_DELIM:
-         lexer_add_current_char(lexer, TOKEN_CHAR_DELIM);
-         break;
       case SYNTAX_LIST_DELIM:
          lexer_add_current_char(lexer, TOKEN_LIST_DELIM);
          break;
+      case SYNTAX_STR_DELIM:
+         lexer->state = LEXER_STATE_STR;
+         break;
       default:
          lexer_add_current_char(lexer, TOKEN_UNKNOWN);
    }
 }
 
-void lexer_do_chr(lexer_t* lexer) {
-   if (*lexer->src == '\'') {
-      lexer->state = LEXER_STATE_REG;
-   } else {
-      token_t* t;
-
-      t = token_init(TOKEN_CHAR, *lexer->src);
-
-      lexer_add_token(lexer, t);
+void lexer_do_tag(lexer_t* lexer) {
+   switch (*lexer->src) {
+      case SYNTAX_SET:
+         lexer_add_current_char(lexer, TOKEN_SET);
+         lexer->state = LEXER_STATE_REG;
+         break;
+      case SYNTAX_APPLY:
+         lexer_add_current_char(lexer, TOKEN_APPLY);
+         lexer->state = LEXER_STATE_REG;
+         break;
+      case SYNTAX_TAG_DELIM:
+         lexer_add_token(lexer, token_init(TOKEN_TAG, '\0'));
+         break;
+      default: lexer_add_current_char_to_last_token(lexer, TOKEN_TAG);
    }
 }
 
 void lexer_do_str(lexer_t* lexer) {
-
+   if (*lexer->src == SYNTAX_STR_DELIM) {
+      lexer->state = LEXER_STATE_REG;
+   } else {
+      lexer_add_current_char_to_last_token(lexer, TOKEN_STR);
+   }
 }
 
 void lexer_run(lexer_t* lexer) {
    while (*lexer->src) {
       if (lexer->state == LEXER_STATE_REG) { lexer_do_reg(lexer); }
+      else if (lexer->state == LEXER_STATE_TAG) { lexer_do_tag(lexer); }
+      else if (lexer->state == LEXER_STATE_STR) { lexer_do_str(lexer); }
       lexer->src ++;
    }
+
+   token_print(lexer->tokenl);
 }
diff --git a/src/token.c b/src/token.c
index ece32f4..935f23e 100644
--- a/src/token.c
+++ b/src/token.c
@@ -2,12 +2,14 @@
 
 #include "include/token.h"
 
-token_t* token_init(int type, char* val) {
+token_t* token_init(int type, char val) {
    token_t* token;
 
    token = emalloc(sizeof(struct TOKEN_STRUC));
    token->type = type;
-   token->val = val;
+   token->val = emalloc(2);
+   *token->val = val;
+   token->val[1] = '\0';
    token->nxt = NULL;
 
    return token;
@@ -32,3 +34,22 @@ token_t* token_last(token_t* token) {
 
    return t;
 }
+
+void token_add_char(token_t* token, char c) {
+   const size_t len = strlen(token->val); /* length before appending */
+   /* room for the old text, the new character, and the terminator */
+   char* grown = erealloc(token->val, len + 2);
+
+   grown[len] = c;
+   grown[len + 1] = '\0';
+   token->val = grown;
+}
+
+void token_print(token_t* token) {
+   token_t* cur; /* node currently being printed */
+
+   /* iterate rather than recurse; a NULL head prints nothing */
+   for (cur = token; cur; cur = cur->nxt) {
+      log_dbg("token/t=%d\t/v=%s", cur->type, cur->val);
+   }
+}
diff --git a/tree.txt b/tree.txt
deleted file mode 100644
index 39340fe..0000000
--- a/tree.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-The Expr
-========
-       [ block ] ⇐ A list of exprs.
-        │    ┌┘
-        │    │
-       [ expr ] ── [ lit ] ⇐ A literal value; "base case" for the tree.
-        │    │      ├── type
-     ┌──┘    └──┐   └─ value
-[ def ]        [ call ]
- │                   │
- ├─ [target] ← id    ├─ [target] ← id ⇐ An id is a pointer to another part of the tree.
- └── [value] ← expr  │                  It also contains the flags used in the definition.
-                     └──── [arg] ← expr
-
-Example Expr Tree
-=================
-[ block ]
- │
- ├─ [ def ]
- │   │
- │   ├─ [target] → hello
- │   └── [value] → [ lit ]
- │                  ├── type → str
- │                  └─ value → Hello, World
- ├─ [ call ]
- │   │
- │   ├─ [target] → print
- │   └──── [arg] → hello
- ...
-
-- 
cgit v1.2.3