author     metamuffin <metamuffin@disroot.org>  2025-04-03 18:14:32 +0200
committer  metamuffin <metamuffin@disroot.org>  2025-04-03 18:14:32 +0200
commit     5ac1cdde1743e818db9ae362376cb52770145973 (patch)
tree       b6f62b02e387386b26fbb6c115dfaf4321de8e0f /attocc.c
parent     e403e5b4b2cd70ddf83fe0f40ea1740967871215 (diff)
download   attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar
           attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar.bz2
           attocc-5ac1cdde1743e818db9ae362376cb52770145973.tar.zst
convert tokenizer to iterator in preparation for one-pass mode
Diffstat (limited to 'attocc.c')
-rw-r--r--  attocc.c | 279
1 file changed, 132 insertions, 147 deletions
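At the API level, the change replaces the build-everything-up-front tokenizer with a pull-based iterator. The signatures below are collected from the diff for orientation; they are a summary of the patch, not part of it.

/* Removed by this commit: tokenize the whole source into an array up front. */
struct token *tokenize(char *source);
void free_tokens(struct token *tokens);
void debug_tokens(struct token *tokens); /* DEBUG builds only */

/* Added by this commit: pull tokens one at a time. */
struct token_iter token_iter_new(char *source);
struct token token_iter_next(struct token_iter *iter); /* TOK_END at end of input, TOK_ERROR on bad input */
void token_free(struct token tok);
void token_print(struct token tok); /* DEBUG builds only */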
@@ -194,6 +194,7 @@ enum token_kind {
   TOK_CONSTANT,
   TOK_SEPERATOR,
   TOK_OPERATOR,
+  TOK_ERROR,
 };
 
 enum constant_kind {
@@ -215,6 +216,7 @@ union token_data {
   };
   enum seperator seperator;
   enum operator operator;
+  char *error_message;
 };
 
 struct token {
@@ -312,34 +314,40 @@ struct error {
   unsigned long position;
 };
 
-unsigned long *build_linemap(char *source) {
+struct linemap {
+  unsigned long *lines;
+};
+
+struct linemap linemap_new(char *source) {
+  struct linemap lm;
   unsigned long i = 0;
-  unsigned long *lines = realloc_failsafe(NULL, sizeof(unsigned long));
+  lm.lines = realloc_failsafe(NULL, sizeof(unsigned long));
   int num_lines = 1;
-  lines[0] = 0;
+  lm.lines[0] = 0;
   while (source[i]) {
     if (source[i] == '\n') {
       num_lines += 1;
-      lines = realloc_failsafe(lines, num_lines * sizeof(unsigned long));
-      lines[num_lines - 1] = i;
+      lm.lines = realloc_failsafe(lm.lines, num_lines * sizeof(unsigned long));
+      lm.lines[num_lines - 1] = i;
     }
     i += 1;
   }
-  lines = realloc_failsafe(lines, (num_lines + 1) * sizeof(unsigned long));
-  lines[num_lines] = 0xffffffff;
-  return lines;
+  lm.lines =
+      realloc_failsafe(lm.lines, (num_lines + 1) * sizeof(unsigned long));
+  lm.lines[num_lines] = 0xffffffff;
+  return lm;
 }
 
-int find_line(unsigned long *linemap, unsigned long position) {
+int linemap_find(struct linemap *lm, unsigned long position) {
   int line = 0;
-  while (linemap[line] < position)
+  while (lm->lines[line] < position)
     line += 1;
   return line;
 }
 
-void print_error(struct error error, char *filename, unsigned long *linemap) {
-  int line = find_line(linemap, error.position);
-  int column = error.position - linemap[line];
+void print_error(struct error error, char *filename, struct linemap *linemap) {
+  int line = linemap_find(linemap, error.position);
+  int column = error.position - linemap->lines[line];
   if (!error.message)
     error.message = "<no message>";
   printf("error: %s\n", error.message);
@@ -352,14 +360,37 @@ struct token *token_push(struct token **tokens, unsigned long *num_tokens) {
   return &(*tokens)[*num_tokens - 1];
 }
 
-struct token *tokenize(char *source) {
-  char *end = source + strlen(source);
-  char *p = source;
-  unsigned long num_tokens = 0;
-  struct token *tokens = NULL;
+struct token_iter {
+  char *end;
+  char *start;
+  char *p;
+};
+
+struct token_iter token_iter_new(char *source) {
+  struct token_iter iter;
+  iter.end = source + strlen(source);
+  iter.start = source;
+  iter.p = source;
+  return iter;
+}
+
+struct token t_error(char *message) {
+  struct token t;
+  t.kind = TOK_ERROR;
+  t.data.error_message = message;
+  return t;
+}
+
+struct token token_iter_next(struct token_iter *iter) {
+  char *p = iter->p;
+  unsigned long remaining = iter->end - p;
+  unsigned long position = p - iter->start;
+
+  struct token tok;
+  tok.kind = TOK_END;
+
   while (*p) {
-    unsigned long remaining = end - p;
-    unsigned long position = p - source;
+    tok.position = position;
 
     //* whitespace
     if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') {
@@ -392,18 +423,14 @@ struct token *tokenize(char *source) {
     for (int i = 0; SEPERATORS[i]; i++) {
       if (p[0] == SEPERATORS[i]) {
         p++;
-        struct token *new_token = token_push(&tokens, &num_tokens);
-        if (!new_token)
-          return NULL;
-        new_token->kind = TOK_SEPERATOR;
-        new_token->data.seperator = i;
-        new_token->position = position;
+        tok.kind = TOK_SEPERATOR;
+        tok.data.seperator = i;
         match = 1;
         break;
       }
     }
     if (match)
-      continue;
+      break;
 
     //* operators
     match = 0;
@@ -411,12 +438,8 @@ struct token *tokenize(char *source) {
       char *op = OPERATORS[i];
       if (remaining >= strlen(op)) {
         if (strncmp(op, p, strlen(op)) == 0) {
-          struct token *new_token = token_push(&tokens, &num_tokens);
-          if (!new_token)
-            return NULL;
-          new_token->kind = TOK_OPERATOR;
-          new_token->data.operator= i;
-          new_token->position = position;
+          tok.kind = TOK_OPERATOR;
+          tok.data.operator= i;
 
           p += strlen(op);
           match = 1;
@@ -425,7 +448,7 @@ struct token *tokenize(char *source) {
       }
     }
     if (match)
-      continue;
+      break;
 
     //* keyword
     match = 0;
@@ -433,13 +456,9 @@ struct token *tokenize(char *source) {
       char *kw = KEYWORDS[i];
      if (remaining >= strlen(kw) + 1) {
         if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) {
-          struct token *new_token = token_push(&tokens, &num_tokens);
-          if (!new_token)
-            return NULL;
-          new_token->kind = TOK_KEYWORD;
-          new_token->data.keyword = i;
-          new_token->position = position;
+          tok.kind = TOK_KEYWORD;
+          tok.data.keyword = i;
 
           p += strlen(kw);
           match = 1;
           break;
@@ -447,7 +466,7 @@ struct token *tokenize(char *source) {
       }
     }
     if (match)
-      continue;
+      break;
 
     //* number constant
     if (is_numeric(p[0])) {
@@ -472,14 +491,10 @@ struct token *tokenize(char *source) {
       }
       p--;
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      if (!new_token)
-        return NULL;
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_INT;
-      new_token->data.constant_int_value = value;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_INT;
+      tok.data.constant_int_value = value;
+      break;
     }
 
     //* string constant
@@ -498,35 +513,31 @@ struct token *tokenize(char *source) {
       str = realloc_failsafe(str, str_len);
       str[str_len - 1] = '\0';
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_STR;
-      new_token->data.constant_str_value = str;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_STR;
+      tok.data.constant_str_value = str;
+      break;
     }
 
     //* char constant
     if (p[0] == '\'') {
       if (!*p++)
-        return NULL;
+        return t_error("eof");
       char chr = p[0];
       if (p[0] == '\\') {
         if (!*p++)
-          return NULL;
+          return t_error("eof");
         chr = map_escape(p[0]);
       }
       if (!*p++)
-        return NULL;
+        return t_error("eof");
      if (*p++ != '\'')
-        return fprintf(stderr, "expected '\n"), NULL;
+        return t_error("expected '\n");
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_CONSTANT;
-      new_token->data.constant_kind = CONST_CHAR;
-      new_token->data.constant_char_value = chr;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_CONSTANT;
+      tok.data.constant_kind = CONST_CHAR;
+      tok.data.constant_char_value = chr;
+      break;
     }
 
     //* identifier
@@ -541,83 +552,69 @@ struct token *tokenize(char *source) {
        ident_str[i] = ident_start[i];
      ident_str[ident_len] = '\0';
 
-      struct token *new_token = token_push(&tokens, &num_tokens);
-      new_token->kind = TOK_IDENTIFIER;
-      new_token->data.identifier = ident_str;
-      new_token->position = position;
-      continue;
+      tok.kind = TOK_IDENTIFIER;
+      tok.data.identifier = ident_str;
+      break;
    }
 
-    fprintf(stderr, "unknown token at %li\n", p - source);
-    return NULL;
+    return t_error("unknown token");
   }
 
-  struct token *new_token = token_push(&tokens, &num_tokens);
-  new_token->kind = TOK_END;
-  new_token->position = 0;
-  return tokens;
+  iter->p = p;
+  return tok;
 }
 
-void free_tokens(struct token *tokens) {
-  for (int i = 0; tokens[i].kind != TOK_END; i++) {
-    switch (tokens[i].kind) {
-    case TOK_IDENTIFIER:
-      free(tokens[i].data.identifier);
+void token_free(struct token tok) {
+  switch (tok.kind) {
+  case TOK_IDENTIFIER:
+    free(tok.data.identifier);
+    break;
+  case TOK_CONSTANT:
+    switch (tok.data.constant_kind) {
+    case CONST_STR:
+      free(tok.data.constant_str_value);
       break;
-    case TOK_CONSTANT:
-      switch (tokens[i].data.constant_kind) {
-      case CONST_STR:
-        free(tokens[i].data.constant_str_value);
-        break;
-      default:
-        break;
-      }
    default:
      break;
    }
+  default:
+    break;
  }
-  free(tokens);
 }
 
 #ifdef DEBUG
-void debug_tokens(struct token *tokens) {
-  for (int i = 0; tokens[i].kind != TOK_END; i++) {
-    printf("%4lu ", tokens[i].position);
-    switch (tokens[i].kind) {
-    case TOK_IDENTIFIER:
-      printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier);
-      break;
-    case TOK_KEYWORD:
-      printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]);
-      break;
-    case TOK_CONSTANT:
-      switch (tokens[i].data.constant_kind) {
-      case CONST_INT:
-        printf("TOK_CONSTANT:CONST_INT:%i, ",
-               tokens[i].data.constant_int_value);
-        break;
-      case CONST_STR:
-        printf("TOK_CONSTANT:CONST_STR:%s, ",
-               tokens[i].data.constant_str_value);
-        break;
-      case CONST_CHAR:
-        printf("TOK_CONSTANT:CONST_CHAR:%c, ",
-               tokens[i].data.constant_char_value);
-        break;
-      }
-      break;
-    case TOK_OPERATOR:
-      printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] );
+void token_print(struct token tok) {
+  printf("%4lu ", tok.position);
+  switch (tok.kind) {
+  case TOK_IDENTIFIER:
+    printf("TOK_IDENTIFIER:%s, ", tok.data.identifier);
+    break;
+  case TOK_KEYWORD:
+    printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tok.data.keyword]);
+    break;
+  case TOK_CONSTANT:
+    switch (tok.data.constant_kind) {
+    case CONST_INT:
+      printf("TOK_CONSTANT:CONST_INT:%i, ", tok.data.constant_int_value);
      break;
-    case TOK_SEPERATOR:
-      printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]);
+    case CONST_STR:
+      printf("TOK_CONSTANT:CONST_STR:%s, ", tok.data.constant_str_value);
      break;
-    case TOK_END:
+    case CONST_CHAR:
+      printf("TOK_CONSTANT:CONST_CHAR:%c, ", tok.data.constant_char_value);
      break;
    }
-    printf("\n");
+    break;
+  case TOK_OPERATOR:
+    printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tok.data.operator] );
+    break;
+  case TOK_SEPERATOR:
+    printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tok.data.seperator]);
+    break;
+  case TOK_END:
+    break;
  }
-  printf("TOK_END\n");
+  printf("\n");
 }
 #endif
 
@@ -1016,36 +1013,24 @@ int main(int argc, char **argv) {
   }
   source[source_len] = '\0';
 
-#ifdef DEBUG
-  printf("%i bytes loaded\n", source_len);
-  printf("\n=== TOKENIZE ===\n");
-#endif
+  struct linemap linemap;
+  struct token_iter iter;
+  struct token tok;
 
-  struct token *tokens = tokenize(source);
-  if (!tokens)
-    return 1;
+  linemap = linemap_new(source);
+  iter = token_iter_new(source);
 
-#ifdef DEBUG
-  debug_tokens(tokens);
-  printf("\n=== PARSE ===\n");
-#endif
-
-  int p = 0;
-  struct error error;
-  error.message = NULL;
-  error.position = 0;
-  struct node *node = parse(&p, tokens, &error);
-  if (!node) {
-    unsigned long *linemap = build_linemap(source);
-    print_error(error, input, linemap);
-    return 1;
-  }
+  while (1) {
+    tok = token_iter_next(&iter);
+    if (tok.kind == TOK_END)
+      break;
 
 #ifdef DEBUG
-  debug_node(node, 0);
+    token_print(tok);
 #endif
 
-  free_tokens(tokens);
+    token_free(tok);
+  };
 
   return 0;
 }
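For context on the "one-pass mode" the commit message mentions, here is a minimal sketch of how a single-pass parser could sit directly on top of the new iterator, assuming the declarations above (struct token, struct token_iter, token_iter_new, token_iter_next, token_free, TOK_END, TOK_ERROR) are in scope. The wrapper and its names (struct parser, parser_new, parser_peek, parser_next) are hypothetical and not part of this patch.

/* Hypothetical one-token-lookahead front end over the new iterator API.
 * Tokens are produced on demand instead of being materialized in an array. */
struct parser {
  struct token_iter iter;
  struct token lookahead; /* next unconsumed token */
};

/* Pull the first token so the lookahead is always valid. */
struct parser parser_new(char *source) {
  struct parser ps;
  ps.iter = token_iter_new(source);
  ps.lookahead = token_iter_next(&ps.iter);
  return ps;
}

/* Inspect the current token without consuming it. */
struct token parser_peek(struct parser *ps) { return ps->lookahead; }

/* Consume the current token and advance; the caller owns the returned token
 * and should pass it to token_free when done. TOK_END and TOK_ERROR are
 * sticky: the iterator is not advanced past them. */
struct token parser_next(struct parser *ps) {
  struct token tok = ps->lookahead;
  if (tok.kind != TOK_END && tok.kind != TOK_ERROR)
    ps->lookahead = token_iter_next(&ps->iter);
  return tok;
}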