diff options
author | metamuffin <metamuffin@disroot.org> | 2024-06-15 01:22:11 +0200 |
---|---|---|
committer | metamuffin <metamuffin@disroot.org> | 2024-06-15 01:22:11 +0200 |
commit | fdedfa4b7e08263ce00e5c0b42b98ab249d5582b (patch) | |
tree | 2999cec0cb50ae883562e909b0b8526d2609f446 | |
parent | b73e0a0a8cf4ad38defd789c137fe74b5dd1d496 (diff) | |
download | attocc-fdedfa4b7e08263ce00e5c0b42b98ab249d5582b.tar attocc-fdedfa4b7e08263ce00e5c0b42b98ab249d5582b.tar.bz2 attocc-fdedfa4b7e08263ce00e5c0b42b98ab249d5582b.tar.zst |
properly parse seps
-rw-r--r-- | attocc.c | 165 |
1 files changed, 129 insertions, 36 deletions
@@ -39,9 +39,60 @@ static char *OPERATORS[] = { ">=", "||", "&&", "+", "-", "*", "/", "%", "<", ">", "~", "&", "|", "^", "!", "&", "*", "=", }; +static char *OPERATOR_NAMES[] = { + "OP_SIZEOF", + "OP_SHIFT_LEFT", + "OP_SHIFT_RIGHT", + "OP_ADD_ASSIGN", + "OP_SUB_ASSIGN", + "OP_MUL_ASSIGN", + "OP_DIV_ASSIGN", + "OP_MOD_ASSIGN", + "OP_SHIFT_LEFT_ASSIGN", + "OP_SHIFT_RIGHT_ASSIGN", + "OP_BITWISE_AND_ASSIGN", + "OP_BITWISE_OR_ASSIGN", + "OP_BITWISE_XOR_ASSIGN", + "OP_INCREMENT", + "OP_DECREMENT", + "OP_EQUAL", + "OP_NOT_EQUAL", + "OP_LESS_EQUAL", + "OP_GREATER_EQUAL", + "OP_LOGICAL_OR", + "OP_LOGICAL_AND", + "OP_ADD", + "OP_SUB", + "OP_MUL", + "OP_DIV", + "OP_MOD", + "OP_LESS", + "OP_GREATER", + "OP_BITWISE_NOT", + "OP_BITWISE_AND", + "OP_BITWISE_OR", + "OP_BITWISE_XOR", + "OP_LOGICAL_NOT", + "OP_POINTER_REF", + "OP_POINTER_DEREF", + "OP_ASSIGN", +}; +static char *KEYWORD_NAMES[] = { + "KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST", + "KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE", + "KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO", + "KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN", + "KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT", + "KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID", + "KW_VOLATILE", "KW_WHILE", +}; +static char *SEPERATOR_NAMES[] = { + "SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY", + "SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR", +}; enum keyword { - kW_AUTO, + KW_AUTO, KW_BREAK, KW_CASE, KW_CHAR, @@ -130,7 +181,7 @@ enum seperator { enum token_kind { TOK_IDENTIFIER, TOK_KEYWORD, - TOK_CONSTANT, + TOK_INT_CONSTANT, TOK_SEPERATOR, TOK_OPERATOR, TOK_END, @@ -149,7 +200,12 @@ struct token { union token_data data; }; -char is_numeric(char c) { return (c >= '0' && c <= '9'); } +char is_numeric(char c) { return c >= '0' && c <= '9'; } +char is_octal(char c) { return c >= '0' && c <= '7'; } +char is_hexadecial(char c) { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f'); +} char is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } @@ -162,7 +218,6 @@ struct token *tokenize(char *source) { struct token *tokens = NULL; while (*p) { unsigned long remaining = end - p; - // printf("tok %c\n", p[0]); //* whitespace if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') { @@ -170,9 +225,31 @@ struct token *tokenize(char *source) { continue; } + //* comments + if (remaining >= 2) { + if (p[0] == '/' && p[1] == '/') { + p += 2; + for (char c; (c = *p++) && c != '\n';) + ; + continue; + } else if (p[0] == '/' && p[1] == '*') { + p += 2; + for (char c, d = '\0'; (d = c, c = *p++) && c == '/' && d == '*';) + ; + continue; + } else if (p[0] == '#') { + p += 1; + for (char c; (c = *p++) && c != '\n';) + ; + continue; + } + } + //* seperators + char match = 0; for (int i = 0; SEPERATORS[i]; i++) { if (p[0] == SEPERATORS[i]) { + p++; num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { @@ -182,11 +259,15 @@ struct token *tokenize(char *source) { struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_SEPERATOR; new_token->data.seperator = i; + match = 1; + break; } } + if (match) + continue; //* operators - char match = 0; + match = 0; for (int i = 0; i < NUM_OPERATORS; i++) { char *op = OPERATORS[i]; if (remaining >= strlen(op)) { @@ -207,29 +288,8 @@ struct token *tokenize(char *source) { } } } - if (match) { + if (match) continue; - } - - //* comments - if (remaining >= 2) { - if (p[0] == '/' && p[1] == '/') { - p += 2; - for (char c; (c = *p++) && c != '\n';) - ; - continue; - } else if (p[0] == '/' && p[1] == '*') { - p += 2; - for (char c, d = '\0'; (d = c, c = *p++) && c == '/' && d == '*';) - ; - continue; - } else if (p[0] == '#') { - p += 1; - for (char c; (c = *p++) && c != '\n';) - ; - continue; - } - } //* keyword match = 0; @@ -254,7 +314,40 @@ struct token *tokenize(char *source) { } } } - if (match) { + if (match) + continue; + + if (is_numeric(p[0])) { + int value = 0; + if (remaining >= 2 && p[1] == 'x') { + p += 2; + for (char c; (c = *p++) && is_hexadecial(c);) { + value *= 0x10; + value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a'); + } + } else if (p[0] == '0') { + p += 1; + for (char c; (c = *p++) && is_octal(c);) { + value *= 010; + value += c - '0'; + } + } else { + for (char c; (c = *p++) && is_numeric(c);) { + value *= 10; + value += c - '0'; + } + } + p--; + + num_tokens += 1; + tokens = realloc(tokens, sizeof(struct token) * num_tokens); + if (!tokens) { + fprintf(stderr, "realloc failed\n"); + return NULL; + } + struct token *new_token = &tokens[num_tokens - 1]; + new_token->kind = TOK_INT_CONSTANT; + new_token->data.constant_value = value; continue; } @@ -263,6 +356,7 @@ struct token *tokenize(char *source) { p++; for (char c; (c = *p++) && is_alphanumeric(c);) ; + p--; int ident_len = p - ident_start - 1; char *ident_str = malloc(ident_len + 1); if (!ident_str) { @@ -270,7 +364,7 @@ struct token *tokenize(char *source) { return NULL; } for (int i = 0; i < ident_len; i++) - ident_str[i] = p[i]; + ident_str[i] = ident_start[i]; ident_str[ident_len] = '\0'; num_tokens += 1; @@ -287,7 +381,6 @@ struct token *tokenize(char *source) { } fprintf(stderr, "unknown token at %li\n", p - source); - printf("%s", p); return NULL; } @@ -307,19 +400,19 @@ void debug_tokens(struct token *tokens) { for (int i = 0; tokens[i].kind != TOK_END; i++) { switch (tokens[i].kind) { case TOK_IDENTIFIER: - printf("TOK_IDENTIFIER, "); + printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier); break; case TOK_KEYWORD: - printf("TOK_KEYWORD, "); + printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]); break; - case TOK_CONSTANT: - printf("TOK_CONSTANT, "); + case TOK_INT_CONSTANT: + printf("TOK_CONSTANT:%i, ", tokens[i].data.constant_value); break; case TOK_OPERATOR: - printf("TOK_OPERATOR, "); + printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] ); break; case TOK_SEPERATOR: - printf("TOK_SEPERATOR, "); + printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]); break; case TOK_END: break; |