/* attocc - A minimal C compiler. Copyright (C) 2024 metamuffin This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ #include #include #include #include #include const int NUM_KEYWORDS = 32; static char *KEYWORDS[] = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while"}; static char *SEPERATORS = "()[]{};,:"; const int NUM_OPERATORS = 36; static char *OPERATORS[] = { "sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^=", "++", "--", "==", "!=", "<=", ">=", "||", "&&", "+", "-", "*", "/", "%", "<", ">", "~", "&", "|", "^", "!", "=", ".", "?", }; #ifdef DEBUG static char *OPERATOR_NAMES[] = { "OP_SIZEOF", "OP_SHIFT_LEFT", "OP_SHIFT_RIGHT", "OP_ADD_ASSIGN", "OP_SUB_ASSIGN", "OP_MUL_ASSIGN", "OP_DIV_ASSIGN", "OP_MOD_ASSIGN", "OP_SHIFT_LEFT_ASSIGN", "OP_SHIFT_RIGHT_ASSIGN", "OP_BITWISE_AND_ASSIGN", "OP_BITWISE_OR_ASSIGN", "OP_BITWISE_XOR_ASSIGN", "OP_INCREMENT", "OP_DECREMENT", "OP_EQUAL", "OP_NOT_EQUAL", "OP_LESS_EQUAL", "OP_GREATER_EQUAL", "OP_LOGICAL_OR", "OP_LOGICAL_AND", "OP_ADD", "OP_SUB", "OP_MUL_OR_POINTER", "OP_DIV", "OP_MOD", "OP_LESS", "OP_GREATER", "OP_BITWISE_NOT", "OP_BITWISE_AND_OR_REF", "OP_BITWISE_OR", "OP_BITWISE_XOR", "OP_LOGICAL_NOT", "OP_ASSIGN", "OP_MEMBER_ACCESS", "OP_TERNARY", }; static char *KEYWORD_NAMES[] = { "KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST", "KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE", "KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO", "KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN", "KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT", "KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID", "KW_VOLATILE", "KW_WHILE", }; static char *SEPERATOR_NAMES[] = { "SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY", "SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR", }; #endif enum keyword { KW_AUTO, KW_BREAK, KW_CASE, KW_CHAR, KW_CONST, KW_CONTINUE, KW_DEFAULT, KW_DO, KW_DOUBLE, KW_ELSE, KW_ENUM, KW_EXTERN, KW_FLOAT, KW_FOR, KW_GOTO, KW_IF, KW_INT, KW_LONG, KW_REGISTER, KW_RETURN, KW_SHORT, KW_SIGNED, KW_SIZEOF, KW_STATIC, KW_STRUCT, KW_SWITCH, KW_TYPEDEF, KW_UNION, KW_UNSIGNED, KW_VOID, KW_VOLATILE, KW_WHILE, }; enum operator{ OP_SIZEOF, // len=6 OP_SHIFT_LEFT, // len=2 OP_SHIFT_RIGHT, OP_ADD_ASSIGN, OP_SUB_ASSIGN, OP_MUL_ASSIGN, OP_DIV_ASSIGN, OP_MOD_ASSIGN, OP_SHIFT_LEFT_ASSIGN, OP_SHIFT_RIGHT_ASSIGN, OP_BITWISE_AND_ASSIGN, OP_BITWISE_OR_ASSIGN, OP_BITWISE_XOR_ASSIGN, OP_INCREMENT, OP_DECREMENT, OP_EQUAL, OP_NOT_EQUAL, OP_LESS_EQUAL, OP_GREATER_EQUAL, OP_LOGICAL_OR, OP_LOGICAL_AND, OP_ADD, // len=1 OP_SUB, OP_MUL_OR_POINTER, OP_DIV, OP_MOD, OP_LESS, OP_GREATER, OP_BITWISE_NOT, OP_BITWISE_AND_OR_REF, OP_BITWISE_OR, OP_BITWISE_XOR, OP_LOGICAL_NOT, OP_ASSIGN, OP_MEMBER_ACCESS, OP_TERNARY, }; enum seperator { SEP_LPAREN, SEP_RPAREN, SEP_LSQUARE, SEP_RSQUARE, SEP_LCURLY, SEP_RCURLY, SEP_SEMICOLON, SEP_COMMA, SEP_DOT, SEP_COLOR, }; enum token_kind { TOK_END, TOK_IDENTIFIER, TOK_KEYWORD, TOK_CONSTANT, TOK_SEPERATOR, TOK_OPERATOR, }; enum constant_kind { CONST_INT, CONST_STR, CONST_CHAR, }; union token_data { char *identifier; enum keyword keyword; struct { enum constant_kind constant_kind; union { int constant_int_value; char constant_char_value; char *constant_str_value; }; }; enum seperator seperator; enum operator operator; }; struct token { enum token_kind kind; union token_data data; }; char is_numeric(char c) { return c >= '0' && c <= '9'; } char is_octal(char c) { return c >= '0' && c <= '7'; } char is_hexadecial(char c) { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } char is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); } char is_ident(char c) { return is_alphanumeric(c) || c == '_'; } char map_escape(char c) { switch (c) { case 'n': return '\n'; case 't': return '\t'; case 'b': return '\b'; case 'r': return '\r'; case '\'': return '\''; case '"': return '"'; case '0': return '\0'; default: return '\0'; } } void *realloc_failsafe(void *old, unsigned long size) { void *p = realloc(old, size); if (!p) { fprintf(stderr, "malloc failed for size %lu\n", size); exit(2); } return p; } char *strdup_failsafe(char *s) { char *p = strdup(s); if (!p) { fprintf(stderr, "strdup failed\n"); exit(2); } return p; } struct token *token_push(struct token **tokens, unsigned long *num_tokens) { *num_tokens += 1; *tokens = realloc_failsafe(*tokens, sizeof(struct token) * *num_tokens); return &(*tokens)[*num_tokens - 1]; } struct token *tokenize(char *source) { char *end = source + strlen(source); char *p = source; unsigned long num_tokens = 0; struct token *tokens = NULL; while (*p) { unsigned long remaining = end - p; //* whitespace if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') { p++; continue; } //* comments if (remaining >= 2) { if (p[0] == '/' && p[1] == '/') { p += 2; for (char c; (c = *p++) && c != '\n';) ; continue; } else if (p[0] == '/' && p[1] == '*') { p += 2; for (char c, d = '\0'; (d = c, c = *p++) && !(c == '/' && d == '*');) ; continue; } else if (p[0] == '#') { p += 1; for (char c; (c = *p++) && c != '\n';) ; continue; } } //* seperators char match = 0; for (int i = 0; SEPERATORS[i]; i++) { if (p[0] == SEPERATORS[i]) { p++; struct token *new_token = token_push(&tokens, &num_tokens); if (!new_token) return NULL; new_token->kind = TOK_SEPERATOR; new_token->data.seperator = i; match = 1; break; } } if (match) continue; //* operators match = 0; for (int i = 0; i < NUM_OPERATORS; i++) { char *op = OPERATORS[i]; if (remaining >= strlen(op)) { if (strncmp(op, p, strlen(op)) == 0) { struct token *new_token = token_push(&tokens, &num_tokens); if (!new_token) return NULL; new_token->kind = TOK_OPERATOR; new_token->data.operator= i; p += strlen(op); match = 1; break; } } } if (match) continue; //* keyword match = 0; for (int i = 0; i < NUM_KEYWORDS; i++) { char *kw = KEYWORDS[i]; if (remaining >= strlen(kw) + 1) { if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) { struct token *new_token = token_push(&tokens, &num_tokens); if (!new_token) return NULL; new_token->kind = TOK_KEYWORD; new_token->data.keyword = i; p += strlen(kw); match = 1; break; } } } if (match) continue; //* number constant if (is_numeric(p[0])) { int value = 0; if (remaining >= 2 && p[1] == 'x') { p += 2; for (char c; (c = *p++) && is_hexadecial(c);) { value *= 0x10; value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a'); } } else if (p[0] == '0') { p += 1; for (char c; (c = *p++) && is_octal(c);) { value *= 010; value += c - '0'; } } else { for (char c; (c = *p++) && is_numeric(c);) { value *= 10; value += c - '0'; } } p--; struct token *new_token = token_push(&tokens, &num_tokens); if (!new_token) return NULL; new_token->kind = TOK_CONSTANT; new_token->data.constant_kind = CONST_INT; new_token->data.constant_int_value = value; continue; } //* string constant if (p[0] == '"') { p++; char *str = NULL; int str_len = 0; for (char c; (c = *p++) && c != '"';) { // TODO escape str_len++; str = realloc_failsafe(str, str_len); str[str_len - 1] = c; } // null termination str_len++; str = realloc_failsafe(str, str_len); str[str_len - 1] = '\0'; struct token *new_token = token_push(&tokens, &num_tokens); new_token->kind = TOK_CONSTANT; new_token->data.constant_kind = CONST_STR; new_token->data.constant_str_value = str; continue; } //* char constant if (p[0] == '\'') { if (!*p++) return NULL; char chr = p[0]; if (p[0] == '\\') { if (!*p++) return NULL; chr = map_escape(p[0]); } if (!*p++) return NULL; if (*p++ != '\'') return fprintf(stderr, "expected '\n"), NULL; struct token *new_token = token_push(&tokens, &num_tokens); new_token->kind = TOK_CONSTANT; new_token->data.constant_kind = CONST_CHAR; new_token->data.constant_char_value = chr; continue; } //* identifier if (is_ident(p[0]) && !is_numeric(p[0])) { char *ident_start = p; for (char c; (c = *p++) && is_ident(c);) ; int ident_len = p - ident_start - 1; char *ident_str = realloc_failsafe(NULL, ident_len + 1); for (int i = 0; i < ident_len; i++) ident_str[i] = ident_start[i]; ident_str[ident_len] = '\0'; struct token *new_token = token_push(&tokens, &num_tokens); new_token->kind = TOK_IDENTIFIER; new_token->data.identifier = ident_str; continue; } fprintf(stderr, "unknown token at %li\n", p - source); return NULL; } struct token *new_token = token_push(&tokens, &num_tokens); new_token->kind = TOK_END; return tokens; } void free_tokens(struct token *tokens) { for (int i = 0; tokens[i].kind != TOK_END; i++) { switch (tokens[i].kind) { case TOK_IDENTIFIER: free(tokens[i].data.identifier); break; case TOK_CONSTANT: switch (tokens[i].data.constant_kind) { case CONST_STR: free(tokens[i].data.constant_str_value); break; default: break; } default: break; } } free(tokens); } #ifdef DEBUG void debug_tokens(struct token *tokens) { for (int i = 0; tokens[i].kind != TOK_END; i++) { switch (tokens[i].kind) { case TOK_IDENTIFIER: printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier); break; case TOK_KEYWORD: printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]); break; case TOK_CONSTANT: switch (tokens[i].data.constant_kind) { case CONST_INT: printf("TOK_CONSTANT:CONST_INT:%i, ", tokens[i].data.constant_int_value); break; case CONST_STR: printf("TOK_CONSTANT:CONST_STR:%s, ", tokens[i].data.constant_str_value); break; case CONST_CHAR: printf("TOK_CONSTANT:CONST_CHAR:%c, ", tokens[i].data.constant_char_value); break; } break; case TOK_OPERATOR: printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] ); break; case TOK_SEPERATOR: printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]); break; case TOK_END: break; } printf("\n"); } printf("TOK_END\n"); } #endif struct node { enum node_kind { NO_BINOP, NO_UNOP, NO_MEMBER_ACCESS, NO_STRUCT, NO_UNION, NO_ENUM, NO_FUNCTION, } kind; union node_data { struct node_data_binop { struct node *lhs; struct node *rhs; } binop; struct node_data_enum { char *name; char is_decl; unsigned long num_members; struct enum_member { int value; char *name; } *members; } enum_; } data; }; char parse_enum_decl(struct token *tokens, struct node **node); char parse_type(struct token *tokens, struct node **node) { if (parse_enum_decl(tokens, node)) return 1; if (*node) return 0; return 0; } char parse_enum_decl(struct token *tokens, struct node **node) { int p = 0; printf("1\n"); if (tokens[p].kind != TOK_KEYWORD || tokens[p].data.keyword != KW_ENUM) return 0; p++; printf("2\n"); char *ident = NULL; if (tokens[p].kind == TOK_IDENTIFIER) { ident = tokens[p].data.identifier; p++; } printf("3 %i %i\n", tokens[p].kind, tokens[p].data.seperator); char is_decl = 0; struct enum_member *members = NULL; unsigned long num_members = 0; if (tokens[p].kind == TOK_SEPERATOR && tokens[p].data.seperator == SEP_LCURLY) { p++; is_decl = 1; printf("4\n"); int value = 0; while (1) { if (tokens[p].kind == TOK_SEPERATOR && tokens[p].data.seperator == SEP_RCURLY) break; if (tokens[p].kind == TOK_END) return 1; if (tokens[p].kind != TOK_IDENTIFIER) return 1; char *m_ident = tokens[p].data.identifier; if (tokens[p].kind == TOK_OPERATOR && tokens[p].data.operator== OP_ASSIGN) { p++; if (tokens[p].kind == TOK_END) return 1; if (tokens[p].kind != TOK_CONSTANT || tokens[p].data.constant_kind != CONST_INT) return 1; value = tokens[p].data.constant_int_value; p++; } num_members++; members = realloc_failsafe(NULL, sizeof(struct enum_member) * num_members); struct enum_member *new_member = &members[num_members - 1]; new_member->value = value++; new_member->name = strdup_failsafe(m_ident); p++; } } else if (tokens[p].kind == TOK_SEPERATOR && tokens[p].data.seperator == SEP_SEMICOLON) { } else { printf("expected members or semicolon\n"); return 1; } *node = realloc_failsafe(NULL, sizeof(struct node)); (**node).kind = NO_ENUM; (**node).data.enum_.is_decl = is_decl; (**node).data.enum_.name = strdup_failsafe(ident); (**node).data.enum_.num_members = num_members; (**node).data.enum_.members = members; return 0; } char parse(struct token *tokens, struct node **node) { return parse_type(tokens, node); } #ifdef DEBUG void debug_node(struct node *node) { switch (node->kind) { case NO_ENUM: printf("NO_ENUM\n"); printf(" name=%s\n", node->data.enum_.name); printf(" is_decl=%i\n", node->data.enum_.is_decl); if (node->data.enum_.is_decl) { printf(" num_members=%li\n", node->data.enum_.num_members); printf(" members\n"); for (int i = 0; i < node->data.enum_.num_members; i++) { printf(" %s=%i\n", node->data.enum_.members[i].name, node->data.enum_.members[i].value); } } break; case NO_BINOP: printf("NO_BINOP\n"); printf(" lhs: "); debug_node(node->data.binop.lhs); printf("\n"); printf(" rhs: "); debug_node(node->data.binop.rhs); printf("\n"); break; case NO_MEMBER_ACCESS: printf("NO_MEMBER_ACCESS\n"); break; case NO_FUNCTION: printf("NO_FUNCTION\n"); break; case NO_STRUCT: printf("NO_STRUCT\n"); break; case NO_UNION: printf("NO_UNION\n"); break; case NO_UNOP: printf("NO_UNOP\n"); break; default: break; } } #endif int main(int argc, char **argv) { if (argc < 3) { fprintf(stderr, "USAGE:\n\tattocc \n"); return 1; } char *input = argv[1]; char *output = argv[2]; int input_fd = open(input, O_RDONLY | O_CLOEXEC); if (input_fd < 0) { perror("cannot open input"); return 1; } int output_fd = open(output, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC, 0640); if (output_fd < 0) { perror("cannot open input"); return 1; } int source_len = 0; char *source = NULL; int size; char buffer[4096]; while ((size = read(input_fd, &buffer, 4096))) { if (size < 0) { perror("cannot read source"); return 1; } source_len += size; source = realloc_failsafe(source, source_len + 1); for (int i = 0; i < size; i++) source[source_len - size + i] = buffer[i]; } source[source_len] = '\0'; struct token *tokens = tokenize(source); if (!tokens) return 1; #ifdef DEBUG debug_tokens(tokens); #endif struct node *nodes = NULL; if (parse(tokens, &nodes)) { printf("parse failed\n"); return 1; } if (!nodes) { printf("parse failed"); return 1; } #ifdef DEBUG debug_node(nodes); #endif free_tokens(tokens); return 0; }