/*
    attocc - A minimal C compiler.
    Copyright (C) 2024 metamuffin

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

/* Ask for the POSIX.1-2008 declarations (O_CLOEXEC) even in strict ISO C
   mode. */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Keyword spellings; the index of each entry equals its `enum keyword`. */
const int NUM_KEYWORDS = 32;
static const char *const KEYWORDS[] = {
    "auto",     "break",  "case",    "char",   "const",    "continue",
    "default",  "do",     "double",  "else",   "enum",     "extern",
    "float",    "for",    "goto",    "if",     "int",      "long",
    "register", "return", "short",   "signed", "sizeof",   "static",
    "struct",   "switch", "typedef", "union",  "unsigned", "void",
    "volatile", "while"};

/* Separator characters; the index of each character equals its
   `enum seperator`. The table used to lack '.', which shifted ':' onto
   SEP_DOT. */
static const char SEPERATORS[] = "()[]{};,.:";

/* Operator spellings; the index of each entry equals its `enum operator`.
   "&" and "*" appear twice (binary and pointer forms); the tokenizer always
   reports the first, binary, entry. */
const int NUM_OPERATORS = 36;
static const char *const OPERATORS[] = {
    "sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=",
    ">>=",    "&=", "|=", "^=", "++", "--", "==", "!=", "<=",
    ">=",     "||", "&&", "+",  "-",  "*",  "/",  "%",  "<",
    ">",      "~",  "&",  "|",  "^",  "!",  "&",  "*",  "=",
};

/* Printable names, indexed by `enum operator`. */
static const char *const OPERATOR_NAMES[] = {
    "OP_SIZEOF",
    "OP_SHIFT_LEFT",
    "OP_SHIFT_RIGHT",
    "OP_ADD_ASSIGN",
    "OP_SUB_ASSIGN",
    "OP_MUL_ASSIGN",
    "OP_DIV_ASSIGN",
    "OP_MOD_ASSIGN",
    "OP_SHIFT_LEFT_ASSIGN",
    "OP_SHIFT_RIGHT_ASSIGN",
    "OP_BITWISE_AND_ASSIGN",
    "OP_BITWISE_OR_ASSIGN",
    "OP_BITWISE_XOR_ASSIGN",
    "OP_INCREMENT",
    "OP_DECREMENT",
    "OP_EQUAL",
    "OP_NOT_EQUAL",
    "OP_LESS_EQUAL",
    "OP_GREATER_EQUAL",
    "OP_LOGICAL_OR",
    "OP_LOGICAL_AND",
    "OP_ADD",
    "OP_SUB",
    "OP_MUL",
    "OP_DIV",
    "OP_MOD",
    "OP_LESS",
    "OP_GREATER",
    "OP_BITWISE_NOT",
    "OP_BITWISE_AND",
    "OP_BITWISE_OR",
    "OP_BITWISE_XOR",
    "OP_LOGICAL_NOT",
    "OP_POINTER_REF",
    "OP_POINTER_DEREF",
    "OP_ASSIGN",
};

/* Printable names, indexed by `enum keyword`. */
static const char *const KEYWORD_NAMES[] = {
    "KW_AUTO",     "KW_BREAK",  "KW_CASE",    "KW_CHAR",   "KW_CONST",
    "KW_CONTINUE", "KW_DEFAULT", "KW_DO",     "KW_DOUBLE", "KW_ELSE",
    "KW_ENUM",     "KW_EXTERN", "KW_FLOAT",   "KW_FOR",    "KW_GOTO",
    "KW_IF",       "KW_INT",    "KW_LONG",    "KW_REGISTER", "KW_RETURN",
    "KW_SHORT",    "KW_SIGNED", "KW_SIZEOF",  "KW_STATIC", "KW_STRUCT",
    "KW_SWITCH",   "KW_TYPEDEF", "KW_UNION",  "KW_UNSIGNED", "KW_VOID",
    "KW_VOLATILE", "KW_WHILE",
};

/* Printable names, indexed by `enum seperator`. */
static const char *const SEPERATOR_NAMES[] = {
    "SEP_LPAREN", "SEP_RPAREN",    "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY",
    "SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA",   "SEP_DOT",     "SEP_COLON",
};

enum keyword {
  KW_AUTO,
  KW_BREAK,
  KW_CASE,
  KW_CHAR,
  KW_CONST,
  KW_CONTINUE,
  KW_DEFAULT,
  KW_DO,
  KW_DOUBLE,
  KW_ELSE,
  KW_ENUM,
  KW_EXTERN,
  KW_FLOAT,
  KW_FOR,
  KW_GOTO,
  KW_IF,
  KW_INT,
  KW_LONG,
  KW_REGISTER,
  KW_RETURN,
  KW_SHORT,
  KW_SIGNED,
  KW_SIZEOF,
  KW_STATIC,
  KW_STRUCT,
  KW_SWITCH,
  KW_TYPEDEF,
  KW_UNION,
  KW_UNSIGNED,
  KW_VOID,
  KW_VOLATILE,
  KW_WHILE,
};

enum operator {
  OP_SIZEOF,      // len=6
  OP_SHIFT_LEFT,  // len=2
  OP_SHIFT_RIGHT,
  OP_ADD_ASSIGN,
  OP_SUB_ASSIGN,
  OP_MUL_ASSIGN,
  OP_DIV_ASSIGN,
  OP_MOD_ASSIGN,
  OP_SHIFT_LEFT_ASSIGN,  // len=3
  OP_SHIFT_RIGHT_ASSIGN,
  OP_BITWISE_AND_ASSIGN,  // len=2
  OP_BITWISE_OR_ASSIGN,
  OP_BITWISE_XOR_ASSIGN,
  OP_INCREMENT,
  OP_DECREMENT,
  OP_EQUAL,
  OP_NOT_EQUAL,
  OP_LESS_EQUAL,
  OP_GREATER_EQUAL,
  OP_LOGICAL_OR,
  OP_LOGICAL_AND,
  OP_ADD,  // len=1
  OP_SUB,
  OP_MUL,
  OP_DIV,
  OP_MOD,
  OP_LESS,
  OP_GREATER,
  OP_BITWISE_NOT,
  OP_BITWISE_AND,
  OP_BITWISE_OR,
  OP_BITWISE_XOR,
  OP_LOGICAL_NOT,
  OP_POINTER_REF,
  OP_POINTER_DEREF,
  OP_ASSIGN,
};

enum seperator {
  SEP_LPAREN,
  SEP_RPAREN,
  SEP_LSQUARE,
  SEP_RSQUARE,
  SEP_LCURLY,
  SEP_RCURLY,
  SEP_SEMICOLON,
  SEP_COMMA,
  SEP_DOT,
  SEP_COLON,
  SEP_COLOR = SEP_COLON,  // old misspelling, kept so existing uses compile
};

enum token_kind {
  TOK_IDENTIFIER,
  TOK_KEYWORD,
  TOK_INT_CONSTANT,
  TOK_SEPERATOR,
  TOK_OPERATOR,
  TOK_END,
};

/* Payload of a token; which member is valid is decided by `token.kind`. */
union token_data {
  char *identifier;  // heap-allocated, NUL-terminated (TOK_IDENTIFIER)
  enum keyword keyword;
  int constant_value;
  enum seperator seperator;
  enum operator operator;
};

struct token {
  enum token_kind kind;
  union token_data data;
};

/* Non-zero when `c` is an ASCII decimal digit. */
char is_numeric(char c) { return c >= '0' && c <= '9'; }

/* Non-zero when `c` is an ASCII octal digit. */
char is_octal(char c) { return c >= '0' && c <= '7'; }

/* Non-zero when `c` is an ASCII hexadecimal digit (either case). */
char is_hexadecial(char c) {
  return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
         (c >= 'a' && c <= 'f');
}

/* Non-zero when `c` is an ASCII letter. */
char is_alpha(char c) {
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

/* Non-zero when `c` is an ASCII letter or decimal digit. */
char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); }

/* Non-zero when `c` is whitespace between tokens. */
static int is_space(char c) {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\v' ||
         c == '\f';
}

/* Non-zero when `c` may start an identifier or keyword. */
static int is_word_start(char c) { return is_alpha(c) || c == '_'; }

/* Non-zero when `c` may continue an identifier or keyword. */
static int is_word_part(char c) { return is_alphanumeric(c) || c == '_'; }

/* Numeric value of a digit that already passed is_hexadecial(). */
static unsigned digit_value(char c) {
  if (c >= '0' && c <= '9')
    return (unsigned)(c - '0');
  if (c >= 'a' && c <= 'f')
    return 10u + (unsigned)(c - 'a');
  return 10u + (unsigned)(c - 'A');
}

/* Growable token array used while scanning. */
struct token_list {
  struct token *items;
  size_t len;
  size_t cap;
};

/* Appends `tok`, doubling the capacity when full.
   Returns 0 on success; on allocation failure prints a message, leaves the
   list untouched and returns -1. */
static int token_list_push(struct token_list *list, struct token tok) {
  if (list->len == list->cap) {
    size_t new_cap = list->cap ? list->cap * 2 : 16;
    // Keep the old pointer until realloc succeeded so nothing leaks.
    struct token *grown = realloc(list->items, new_cap * sizeof *grown);
    if (!grown) {
      fprintf(stderr, "realloc failed\n");
      return -1;
    }
    list->items = grown;
    list->cap = new_cap;
  }
  list->items[list->len++] = tok;
  return 0;
}

/* Releases the array and every identifier string stored in it. */
static void token_list_destroy(struct token_list *list) {
  for (size_t i = 0; i < list->len; i++) {
    if (list->items[i].kind == TOK_IDENTIFIER)
      free(list->items[i].data.identifier);
  }
  free(list->items);
  list->items = NULL;
  list->len = 0;
  list->cap = 0;
}

/* Index of the entry of `table` that is exactly the `len` bytes at `word`,
   or -1 when there is none. */
static int find_word(const char *const *table, int count, const char *word,
                     size_t len) {
  for (int i = 0; i < count; i++) {
    if (strlen(table[i]) == len && memcmp(table[i], word, len) == 0)
      return i;
  }
  return -1;
}

/* Index of the longest operator that is a prefix of `p`, or -1.
   Longest match is required so "<<=" is not split into "<<" and "=";
   on equal length the earlier table entry wins. */
static int match_operator(const char *p) {
  int best = -1;
  size_t best_len = 0;
  for (int i = 0; i < NUM_OPERATORS; i++) {
    size_t len = strlen(OPERATORS[i]);
    // strncmp stops at the NUL of `p`, so this never reads past the input.
    if (len > best_len && strncmp(OPERATORS[i], p, len) == 0) {
      best = i;
      best_len = len;
    }
  }
  return best;
}

/* Scans `source` and appends its tokens (without TOK_END) to `list`.
   Returns 0 on success, -1 after printing a diagnostic to stderr. */
static int scan_tokens(const char *source, struct token_list *list) {
  const char *p = source;

  while (*p) {
    //* whitespace
    if (is_space(*p)) {
      p++;
      continue;
    }

    //* line comments and preprocessor lines are skipped up to the newline
    // p[1] is readable because p[0] is not the terminator.
    if ((p[0] == '/' && p[1] == '/') || p[0] == '#') {
      while (*p && *p != '\n')
        p++;
      continue;
    }

    //* block comments
    if (p[0] == '/' && p[1] == '*') {
      // Search after the opener so "/*/" does not count as closed.
      const char *close = strstr(p + 2, "*/");
      if (!close) {
        fprintf(stderr, "unterminated comment at %td\n", p - source);
        return -1;
      }
      p = close + 2;
      continue;
    }

    //* words: alphabetic operators (sizeof), keywords, identifiers
    if (is_word_start(*p)) {
      const char *start = p;
      while (is_word_part(*p))
        p++;
      size_t len = (size_t)(p - start);

      // The whole word is compared, so "double" is never seen as "do".
      int idx = find_word(OPERATORS, NUM_OPERATORS, start, len);
      if (idx >= 0) {
        if (token_list_push(list, (struct token){.kind = TOK_OPERATOR,
                                                 .data.operator = idx }) != 0)
          return -1;
        continue;
      }
      idx = find_word(KEYWORDS, NUM_KEYWORDS, start, len);
      if (idx >= 0) {
        if (token_list_push(list, (struct token){.kind = TOK_KEYWORD,
                                                 .data.keyword = idx}) != 0)
          return -1;
        continue;
      }

      char *name = malloc(len + 1);
      if (!name) {
        fprintf(stderr, "malloc failed\n");
        return -1;
      }
      memcpy(name, start, len);
      name[len] = '\0';
      if (token_list_push(list, (struct token){.kind = TOK_IDENTIFIER,
                                               .data.identifier = name}) != 0) {
        free(name);  // not owned by the list yet
        return -1;
      }
      continue;
    }

    //* integer constants: 0x/0X hexadecimal, leading-0 octal, decimal
    if (is_numeric(*p)) {
      // Unsigned accumulation: oversized literals wrap instead of being UB.
      unsigned value = 0;
      if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
        for (p += 2; is_hexadecial(*p); p++)
          value = value * 0x10 + digit_value(*p);
      } else if (p[0] == '0') {
        for (p += 1; is_octal(*p); p++)
          value = value * 010 + digit_value(*p);
      } else {
        for (; is_numeric(*p); p++)
          value = value * 10 + digit_value(*p);
      }
      if (token_list_push(list,
                          (struct token){.kind = TOK_INT_CONSTANT,
                                         .data.constant_value = (int)value}) !=
          0)
        return -1;
      continue;
    }

    //* seperators
    const char *sep = strchr(SEPERATORS, *p);
    if (sep) {
      enum seperator which = (enum seperator)(sep - SEPERATORS);
      if (token_list_push(list, (struct token){.kind = TOK_SEPERATOR,
                                               .data.seperator = which}) != 0)
        return -1;
      p++;
      continue;
    }

    //* operators
    int op = match_operator(p);
    if (op >= 0) {
      if (token_list_push(list, (struct token){.kind = TOK_OPERATOR,
                                               .data.operator = op }) != 0)
        return -1;
      p += strlen(OPERATORS[op]);
      continue;
    }

    fprintf(stderr, "unknown token at %td\n", p - source);
    return -1;
  }
  return 0;
}

/* Splits NUL-terminated C source text into tokens.

   Returns a heap-allocated array whose last element has kind TOK_END, or
   NULL after printing a diagnostic to stderr (unknown character,
   unterminated block comment, allocation failure). The caller owns the
   result and releases it with free_tokens(). `source` is not modified. */
struct token *tokenize(const char *source) {
  struct token_list list = {NULL, 0, 0};

  if (scan_tokens(source, &list) != 0 ||
      token_list_push(&list, (struct token){.kind = TOK_END}) != 0) {
    token_list_destroy(&list);
    return NULL;
  }
  return list.items;
}

/* Releases an array returned by tokenize(), including the identifier
   strings it owns. Accepts NULL. */
void free_tokens(struct token *tokens) {
  if (!tokens)
    return;
  for (size_t i = 0; tokens[i].kind != TOK_END; i++) {
    if (tokens[i].kind == TOK_IDENTIFIER)
      free(tokens[i].data.identifier);
  }
  free(tokens);
}

/* Prints a TOK_END-terminated token array on one line of stdout. */
void debug_tokens(struct token *tokens) {
  for (size_t i = 0; tokens[i].kind != TOK_END; i++) {
    switch (tokens[i].kind) {
    case TOK_IDENTIFIER:
      printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier);
      break;
    case TOK_KEYWORD:
      printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]);
      break;
    case TOK_INT_CONSTANT:
      printf("TOK_CONSTANT:%i, ", tokens[i].data.constant_value);
      break;
    case TOK_OPERATOR:
      printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator]);
      break;
    case TOK_SEPERATOR:
      printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]);
      break;
    case TOK_END:
      break;
    }
  }
  printf("TOK_END\n");
}

/* Define ATTOCC_NO_MAIN to link the tokenizer into another program, such as
   a test harness that brings its own main(). */
#ifndef ATTOCC_NO_MAIN

/* Reads `fd` until end of file.
   Returns a heap-allocated, NUL-terminated buffer (an empty string for an
   empty file), or NULL after printing a diagnostic to stderr. */
static char *read_all(int fd) {
  size_t len = 0;
  size_t cap = 4096;
  char *buf = malloc(cap + 1);  // +1 for the terminator
  if (!buf) {
    fprintf(stderr, "malloc failed\n");
    return NULL;
  }

  for (;;) {
    if (len == cap) {
      cap *= 2;
      char *grown = realloc(buf, cap + 1);
      if (!grown) {
        fprintf(stderr, "malloc failed\n");
        free(buf);
        return NULL;
      }
      buf = grown;
    }
    ssize_t got = read(fd, buf + len, cap - len);
    if (got < 0) {
      perror("cannot read source");
      free(buf);
      return NULL;
    }
    if (got == 0)
      break;
    len += (size_t)got;
  }
  buf[len] = '\0';
  return buf;
}

/* attocc <input> <output>: tokenizes <input> and prints the tokens on
   stdout. <output> is created/truncated but nothing is written to it here.
   Exits with 0 on success and 1 on any error. */
int main(int argc, char **argv) {
  if (argc < 3) {
    fprintf(stderr, "USAGE:\n\tattocc <input> <output>\n");
    return 1;
  }
  const char *input = argv[1];
  const char *output = argv[2];

  int status = 1;
  int output_fd = -1;
  char *source = NULL;
  struct token *tokens = NULL;

  int input_fd = open(input, O_RDONLY | O_CLOEXEC);
  if (input_fd < 0) {
    perror("cannot open input");
    goto out;
  }
  output_fd = open(output, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC, 0640);
  if (output_fd < 0) {
    perror("cannot open output");
    goto out;
  }

  source = read_all(input_fd);
  if (!source)
    goto out;

  tokens = tokenize(source);
  if (!tokens)
    goto out;

  debug_tokens(tokens);
  status = 0;

out:
  free_tokens(tokens);
  free(source);
  if (output_fd >= 0)
    close(output_fd);
  if (input_fd >= 0)
    close(input_fd);
  return status;
}

#endif /* ATTOCC_NO_MAIN */