author     metamuffin <metamuffin@disroot.org>  2025-04-03 18:14:32 +0200
committer  metamuffin <metamuffin@disroot.org>  2025-04-03 18:14:32 +0200
commit     5ac1cdde1743e818db9ae362376cb52770145973 (patch)
tree       b6f62b02e387386b26fbb6c115dfaf4321de8e0f /attocc.c
parent     e403e5b4b2cd70ddf83fe0f40ea1740967871215 (diff)
download   attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar
           attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar.bz2
           attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar.zst
convert tokenizer to iterator in preparation for one-pass mode
Diffstat (limited to 'attocc.c')
-rw-r--r--  attocc.c | 279
1 file changed, 132 insertions, 147 deletions
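At the API level, the change replaces the build-everything-up-front tokenizer with a pull-based iterator. The signatures below are collected from the diff for orientation; they are a summary of the patch, not part of it.

/* Removed by this commit: tokenize the whole source into an array up front. */
struct token *tokenize(char *source);
void free_tokens(struct token *tokens);
void debug_tokens(struct token *tokens); /* DEBUG builds only */

/* Added by this commit: pull tokens one at a time. */
struct token_iter token_iter_new(char *source);
struct token token_iter_next(struct token_iter *iter); /* TOK_END at end of input, TOK_ERROR on bad input */
void token_free(struct token tok);
void token_print(struct token tok); /* DEBUG builds only */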
@@ -194,6 +194,7 @@ enum token_kind {
   TOK_CONSTANT,
   TOK_SEPERATOR,
   TOK_OPERATOR,
+  TOK_ERROR,
 };
 
 enum constant_kind {
@@ -215,6 +216,7 @@ union token_data {
   };
   enum seperator seperator;
   enum operator operator;
+  char *error_message;
 };
 
 struct token {
@@ -312,34 +314,40 @@ struct error {
   unsigned long position;
 };
 
-unsigned long *build_linemap(char *source) {
+struct linemap {
+  unsigned long *lines;
+};
+
+struct linemap linemap_new(char *source) {
+  struct linemap lm;
   unsigned long i = 0;
-  unsigned long *lines = realloc_failsafe(NULL, sizeof(unsigned long));
+  lm.lines = realloc_failsafe(NULL, sizeof(unsigned long));
   int num_lines = 1;
-  lines[0] = 0;
+  lm.lines[0] = 0;
   while (source[i]) {
     if (source[i] == '\n') {
       num_lines += 1;
-      lines = realloc_failsafe(lines, num_lines * sizeof(unsigned long));
-      lines[num_lines - 1] = i;
+      lm.lines = realloc_failsafe(lm.lines, num_lines * sizeof(unsigned long));
+      lm.lines[num_lines - 1] = i;
     }
     i += 1;
   }
-  lines = realloc_failsafe(lines, (num_lines + 1) * sizeof(unsigned long));
-  lines[num_lines] = 0xffffffff;
-  return lines;
+  lm.lines =
+      realloc_failsafe(lm.lines, (num_lines + 1) * sizeof(unsigned long));
+  lm.lines[num_lines] = 0xffffffff;
+  return lm;
 }
 
-int find_line(unsigned long *linemap, unsigned long position) {
+int linemap_find(struct linemap *lm, unsigned long position) {
   int line = 0;
-  while (linemap[line] < position)
+  while (lm->lines[line] < position)
     line += 1;
   return line;
 }
 
-void print_error(struct error error, char *filename, unsigned long *linemap) {
-  int line = find_line(linemap, error.position);
-  int column = error.position - linemap[line];
+void print_error(struct error error, char *filename, struct linemap *linemap) {
+  int line = linemap_find(linemap, error.position);
+  int column = error.position - linemap->lines[line];
   if (!error.message)
     error.message = "<no message>";
   printf("error: %s\n", error.message);
@@ -352,14 +360,37 @@ struct token *token_push(struct token **tokens, unsigned long *num_tokens) {
   return &(*tokens)[*num_tokens - 1];
 }
 
-struct token *tokenize(char *source) {
-  char *end = source + strlen(source);
-  char *p = source;
-  unsigned long num_tokens = 0;
-  struct token *tokens = NULL;
+struct token_iter {
+  char *end;
+  char *start;
+  char *p;
+};
+
+struct token_iter token_iter_new(char *source) {
+  struct token_iter iter;
+  iter.end = source + strlen(source);
+  iter.start = source;
+  iter.p = source;
+  return iter;
+}
+
+struct token t_error(char *message) {
+  struct token t;
+  t.kind = TOK_ERROR;
+  t.data.error_message = message;
+  return t;
+}
+
+struct token token_iter_next(struct token_iter *iter) {
+  char *p = iter->p;
+  unsigned long remaining = iter->end - p;
+  unsigned long position = p - iter->start;
+
+  struct token tok;
+  tok.kind = TOK_END;
+
   while (*p) {
-    unsigned long remaining = end - p;
-    unsigned long position = p - source;
+    tok.position = position;
 
     //* whitespace
     if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') {
@@ -392,18 +423,14 @@ struct token *tokenize(char *source) {
     for (int i = 0; SEPERATORS[i]; i++) {
       if (p[0] == SEPERATORS[i]) {
         p++;
-        struct token *new_token = token_push(&tokens, &num_tokens);
-        if (!new_token)
-          return NULL;
-        new_token->kind = TOK_SEPERATOR;
-        new_token->data.seperator = i;
-        new_token->position = position;
+        tok.kind = TOK_SEPERATOR;
+        tok.data.seperator = i;
         match = 1;
         break;
       }
     }
     if (match)
-      continue;
+      break;
 
     //* operators
     match = 0;
@@ -411,12 +438,8 @@ struct token *tokenize(char *source) {
       char *op = OPERATORS[i];
       if (remaining >= strlen(op)) {
         if (strncmp(op, p, strlen(op)) == 0) {
-          struct token *new_token = token_push(&tokens, &num_tokens);
-          if (!new_token)
-            return NULL;
-          new_token->kind = TOK_OPERATOR;
-          new_token->data.operator= i;
-          new_token->position = position;
+          tok.kind = TOK_OPERATOR;
+          tok.data.operator= i;
 
           p += strlen(op);
           match = 1;
@@ -425,7 +448,7 @@ struct token *tokenize(char *source) {
       }
     }
     if (match)
-      continue;
+      break;
 
     //* keyword
     match = 0;
@@ -433,13 +456,9 @@ struct token *tokenize(char *source) {
       char *kw = KEYWORDS[i];
      if (remaining >= strlen(kw) + 1) {
         if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) {
-          struct token *new_token = token_push(&tokens, &num_tokens);
-          if (!new_token)
-            return NULL;
-          new_token->kind = TOK_KEYWORD;
-          new_token->data.keyword = i;
-          new_token->position = position;
+          tok.kind = TOK_KEYWORD;
+          tok.data.keyword = i;
 
           p += strlen(kw);
           match = 1;
           break;
@@ -447,7 +466,7 @@ struct token *tokenize(char *source) {
       }
     }
     if (match)
-      continue;
+      break;
 
     //* number constant
     if (is_numeric(p[0])) {
@@ -472,14 +491,10 @@ struct token *tokenize(char *source) {
       }
       p--;
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      if (!new_token)
-        return NULL;
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_INT;
-      new_token->data.constant_int_value = value;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_INT;
+      tok.data.constant_int_value = value;
+      break;
     }
 
     //* string constant
@@ -498,35 +513,31 @@ struct token *tokenize(char *source) {
       str = realloc_failsafe(str, str_len);
       str[str_len - 1] = '\0';
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_STR;
-      new_token->data.constant_str_value = str;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_STR;
+      tok.data.constant_str_value = str;
+      break;
     }
 
     //* char constant
     if (p[0] == '\'') {
       if (!*p++)
-        return NULL;
+        return t_error("eof");
       char chr = p[0];
       if (p[0] == '\\') {
         if (!*p++)
-          return NULL;
+          return t_error("eof");
         chr = map_escape(p[0]);
       }
       if (!*p++)
-        return NULL;
+        return t_error("eof");
      if (*p++ != '\'')
-        return fprintf(stderr, "expected '\n"), NULL;
+        return t_error("expected '\n");
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_CHAR;
-      new_token->data.constant_char_value = chr;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_CHAR;
+      tok.data.constant_char_value = chr;
+      break;
     }
 
     //* identifier
@@ -541,83 +552,69 @@ struct token *tokenize(char *source) {
        ident_str[i] = ident_start[i];
      ident_str[ident_len] = '\0';
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_IDENTIFIER;
-      new_token->data.identifier = ident_str;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_IDENTIFIER;
+      tok.data.identifier = ident_str;
+      break;
    }
 
-    fprintf(stderr, "unknown token at %li\n", p - source);
-    return NULL;
+    return t_error("unknown token");
   }
 
-  struct token *new_token = token_push(&tokens, &num_tokens);
-  new_token->kind = TOK_END;
-  new_token->position = 0;
-  return tokens;
+  iter->p = p;
+  return tok;
 }
 
-void free_tokens(struct token *tokens) {
-  for (int i = 0; tokens[i].kind != TOK_END; i++) {
-    switch (tokens[i].kind) {
-    case TOK_IDENTIFIER:
-      free(tokens[i].data.identifier);
+void token_free(struct token tok) {
+  switch (tok.kind) {
+  case TOK_IDENTIFIER:
+    free(tok.data.identifier);
+    break;
+  case TOK_CONSTANT:
+    switch (tok.data.constant_kind) {
+    case CONST_STR:
+      free(tok.data.constant_str_value);
       break;
-    case TOK_CONSTANT:
-      switch (tokens[i].data.constant_kind) {
-      case CONST_STR:
-        free(tokens[i].data.constant_str_value);
-        break;
-      default:
-        break;
-      }
    default:
      break;
    }
+  default:
+    break;
  }
-  free(tokens);
 }
 
 #ifdef DEBUG
-void debug_tokens(struct token *tokens) {
-  for (int i = 0; tokens[i].kind != TOK_END; i++) {
-    printf("%4lu ", tokens[i].position);
-    switch (tokens[i].kind) {
-    case TOK_IDENTIFIER:
-      printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier);
-      break;
-    case TOK_KEYWORD:
-      printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]);
-      break;
-    case TOK_CONSTANT:
-      switch (tokens[i].data.constant_kind) {
-      case CONST_INT:
-        printf("TOK_CONSTANT:CONST_INT:%i, ",
-               tokens[i].data.constant_int_value);
-        break;
-      case CONST_STR:
-        printf("TOK_CONSTANT:CONST_STR:%s, ",
-               tokens[i].data.constant_str_value);
-        break;
-      case CONST_CHAR:
-        printf("TOK_CONSTANT:CONST_CHAR:%c, ",
-               tokens[i].data.constant_char_value);
-        break;
-      }
-      break;
-    case TOK_OPERATOR:
-      printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] );
+void token_print(struct token tok) {
+  printf("%4lu ", tok.position);
+  switch (tok.kind) {
+  case TOK_IDENTIFIER:
+    printf("TOK_IDENTIFIER:%s, ", tok.data.identifier);
+    break;
+  case TOK_KEYWORD:
+    printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tok.data.keyword]);
+    break;
+  case TOK_CONSTANT:
+    switch (tok.data.constant_kind) {
+    case CONST_INT:
+      printf("TOK_CONSTANT:CONST_INT:%i, ", tok.data.constant_int_value);
      break;
-    case TOK_SEPERATOR:
-      printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]);
+    case CONST_STR:
+      printf("TOK_CONSTANT:CONST_STR:%s, ", tok.data.constant_str_value);
      break;
-    case TOK_END:
+    case CONST_CHAR:
+      printf("TOK_CONSTANT:CONST_CHAR:%c, ", tok.data.constant_char_value);
      break;
    }
-    printf("\n");
+    break;
+  case TOK_OPERATOR:
+    printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tok.data.operator] );
+    break;
+  case TOK_SEPERATOR:
+    printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tok.data.seperator]);
+    break;
+  case TOK_END:
+    break;
  }
-  printf("TOK_END\n");
+  printf("\n");
 }
 #endif
 
@@ -1016,36 +1013,24 @@ int main(int argc, char **argv) {
   }
   source[source_len] = '\0';
 
-#ifdef DEBUG
-  printf("%i bytes loaded\n", source_len);
-  printf("\n=== TOKENIZE ===\n");
-#endif
+  struct linemap linemap;
+  struct token_iter iter;
+  struct token tok;
 
-  struct token *tokens = tokenize(source);
-  if (!tokens)
-    return 1;
+  linemap = linemap_new(source);
+  iter = token_iter_new(source);
 
-#ifdef DEBUG
-  debug_tokens(tokens);
-  printf("\n=== PARSE ===\n");
-#endif
-
-  int p = 0;
-  struct error error;
-  error.message = NULL;
-  error.position = 0;
-  struct node *node = parse(&p, tokens, &error);
-  if (!node) {
-    unsigned long *linemap = build_linemap(source);
-    print_error(error, input, linemap);
-    return 1;
-  }
+  while (1) {
+    tok = token_iter_next(&iter);
+    if (tok.kind == TOK_END)
+      break;
 
 #ifdef DEBUG
-  debug_node(node, 0);
+    token_print(tok);
 #endif
 
-  free_tokens(tokens);
+    token_free(tok);
+  };
 
   return 0;
 }
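For context on the "one-pass mode" the commit message mentions, here is a minimal sketch of how a single-pass parser could sit directly on top of the new iterator, assuming the declarations above (struct token, struct token_iter, token_iter_new, token_iter_next, token_free, TOK_END, TOK_ERROR) are in scope. The wrapper and its names (struct parser, parser_new, parser_peek, parser_next) are hypothetical and not part of this patch.

/* Hypothetical one-token-lookahead front end over the new iterator API.
 * Tokens are produced on demand instead of being materialized in an array. */
struct parser {
  struct token_iter iter;
  struct token lookahead; /* next unconsumed token */
};

/* Pull the first token so the lookahead is always valid. */
struct parser parser_new(char *source) {
  struct parser ps;
  ps.iter = token_iter_new(source);
  ps.lookahead = token_iter_next(&ps.iter);
  return ps;
}

/* Inspect the current token without consuming it. */
struct token parser_peek(struct parser *ps) { return ps->lookahead; }

/* Consume the current token and advance; the caller owns the returned token
 * and should pass it to token_free when done. TOK_END and TOK_ERROR are
 * sticky: the iterator is not advanced past them. */
struct token parser_next(struct parser *ps) {
  struct token tok = ps->lookahead;
  if (tok.kind != TOK_END && tok.kind != TOK_ERROR)
    ps->lookahead = token_iter_next(&ps->iter);
  return tok;
}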