/* attocc - A minimal C compiler. Copyright (C) 2024 metamuffin This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ #include #include #include #include #include const int NUM_KEYWORDS = 32; static char *KEYWORDS[] = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while"}; static char *SEPERATORS = "()[]{};,:"; const int NUM_OPERATORS = 36; static char *OPERATORS[] = { "sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^=", "++", "--", "==", "!=", "<=", ">=", "||", "&&", "+", "-", "*", "/", "%", "<", ">", "~", "&", "|", "^", "!", "=", ".", "?", }; #ifdef DEBUG static char *OPERATOR_NAMES[] = { "OP_SIZEOF", "OP_SHIFT_LEFT", "OP_SHIFT_RIGHT", "OP_ADD_ASSIGN", "OP_SUB_ASSIGN", "OP_MUL_ASSIGN", "OP_DIV_ASSIGN", "OP_MOD_ASSIGN", "OP_SHIFT_LEFT_ASSIGN", "OP_SHIFT_RIGHT_ASSIGN", "OP_BITWISE_AND_ASSIGN", "OP_BITWISE_OR_ASSIGN", "OP_BITWISE_XOR_ASSIGN", "OP_INCREMENT", "OP_DECREMENT", "OP_EQUAL", "OP_NOT_EQUAL", "OP_LESS_EQUAL", "OP_GREATER_EQUAL", "OP_LOGICAL_OR", "OP_LOGICAL_AND", "OP_ADD", "OP_SUB", "OP_MUL_OR_POINTER", "OP_DIV", "OP_MOD", "OP_LESS", "OP_GREATER", "OP_BITWISE_NOT", "OP_BITWISE_AND_OR_REF", "OP_BITWISE_OR", "OP_BITWISE_XOR", "OP_LOGICAL_NOT", "OP_ASSIGN", "OP_MEMBER_ACCESS", "OP_TERNARY", }; static char *KEYWORD_NAMES[] = { "KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST", "KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE", "KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO", "KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN", "KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT", "KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID", "KW_VOLATILE", "KW_WHILE", }; static char *SEPERATOR_NAMES[] = { "SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY", "SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR", }; #endif enum keyword { KW_AUTO, KW_BREAK, KW_CASE, KW_CHAR, KW_CONST, KW_CONTINUE, KW_DEFAULT, KW_DO, KW_DOUBLE, KW_ELSE, KW_ENUM, KW_EXTERN, KW_FLOAT, KW_FOR, KW_GOTO, KW_IF, KW_INT, KW_LONG, KW_REGISTER, KW_RETURN, KW_SHORT, KW_SIGNED, KW_SIZEOF, KW_STATIC, KW_STRUCT, KW_SWITCH, KW_TYPEDEF, KW_UNION, KW_UNSIGNED, KW_VOID, KW_VOLATILE, KW_WHILE, }; enum operator{ OP_SIZEOF, // len=6 OP_SHIFT_LEFT, // len=2 OP_SHIFT_RIGHT, OP_ADD_ASSIGN, OP_SUB_ASSIGN, OP_MUL_ASSIGN, OP_DIV_ASSIGN, OP_MOD_ASSIGN, OP_SHIFT_LEFT_ASSIGN, OP_SHIFT_RIGHT_ASSIGN, OP_BITWISE_AND_ASSIGN, OP_BITWISE_OR_ASSIGN, OP_BITWISE_XOR_ASSIGN, OP_INCREMENT, OP_DECREMENT, OP_EQUAL, OP_NOT_EQUAL, OP_LESS_EQUAL, OP_GREATER_EQUAL, OP_LOGICAL_OR, OP_LOGICAL_AND, OP_ADD, // len=1 OP_SUB, OP_MUL_OR_POINTER, OP_DIV, OP_MOD, OP_LESS, OP_GREATER, OP_BITWISE_NOT, OP_BITWISE_AND_OR_REF, OP_BITWISE_OR, OP_BITWISE_XOR, OP_LOGICAL_NOT, OP_ASSIGN, OP_MEMBER_ACCESS, OP_TERNARY, }; enum seperator { SEP_LPAREN, SEP_RPAREN, SEP_LSQUARE, SEP_RSQUARE, SEP_LCURLY, SEP_RCURLY, SEP_SEMICOLON, SEP_COMMA, SEP_DOT, SEP_COLOR, }; enum token_kind { TOK_IDENTIFIER, TOK_KEYWORD, TOK_CONSTANT, TOK_SEPERATOR, TOK_OPERATOR, TOK_END, }; enum constant_kind { CONST_INT, CONST_STR, CONST_CHAR, }; union token_data { char *identifier; enum keyword keyword; struct { enum constant_kind constant_kind; union { int constant_int_value; char constant_char_value; char *constant_str_value; }; }; enum seperator seperator; enum operator operator; }; struct token { enum token_kind kind; union token_data data; }; char is_numeric(char c) { return c >= '0' && c <= '9'; } char is_octal(char c) { return c >= '0' && c <= '7'; } char is_hexadecial(char c) { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } char is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); } char is_ident(char c) { return is_alphanumeric(c) || c == '_'; } char map_escape(char c) { switch (c) { case 'n': return '\n'; case 't': return '\t'; case 'b': return '\b'; case 'r': return '\r'; case '\'': return '\''; case '"': return '"'; default: return '\0'; } } struct token *tokenize(char *source) { char *end = source + strlen(source); char *p = source; unsigned long num_tokens = 0; struct token *tokens = NULL; while (*p) { unsigned long remaining = end - p; //* whitespace if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') { p++; continue; } //* comments if (remaining >= 2) { if (p[0] == '/' && p[1] == '/') { p += 2; for (char c; (c = *p++) && c != '\n';) ; continue; } else if (p[0] == '/' && p[1] == '*') { p += 2; for (char c, d = '\0'; (d = c, c = *p++) && !(c == '/' && d == '*');) ; continue; } else if (p[0] == '#') { p += 1; for (char c; (c = *p++) && c != '\n';) ; continue; } } //* seperators char match = 0; for (int i = 0; SEPERATORS[i]; i++) { if (p[0] == SEPERATORS[i]) { p++; num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_SEPERATOR; new_token->data.seperator = i; match = 1; break; } } if (match) continue; //* operators match = 0; for (int i = 0; i < NUM_OPERATORS; i++) { char *op = OPERATORS[i]; if (remaining >= strlen(op)) { if (strncmp(op, p, strlen(op)) == 0) { num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_OPERATOR; new_token->data.operator= i; p += strlen(op); match = 1; break; } } } if (match) continue; //* keyword match = 0; for (int i = 0; i < NUM_KEYWORDS; i++) { char *kw = KEYWORDS[i]; if (remaining >= strlen(kw)) { if (strncmp(kw, p, strlen(kw)) == 0) { num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_KEYWORD; new_token->data.keyword = i; p += strlen(kw); match = 1; break; } } } if (match) continue; //* number constant if (is_numeric(p[0])) { int value = 0; if (remaining >= 2 && p[1] == 'x') { p += 2; for (char c; (c = *p++) && is_hexadecial(c);) { value *= 0x10; value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a'); } } else if (p[0] == '0') { p += 1; for (char c; (c = *p++) && is_octal(c);) { value *= 010; value += c - '0'; } } else { for (char c; (c = *p++) && is_numeric(c);) { value *= 10; value += c - '0'; } } p--; num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_CONSTANT; new_token->data.constant_kind = CONST_INT; new_token->data.constant_int_value = value; continue; } //* string constant if (p[0] == '"') { p++; char *str = NULL; int str_len = 0; for (char c; (c = *p++) && c != '"';) { // TODO escape str_len++; str = realloc(str, str_len); if (!str) { fprintf(stderr, "realloc failed\n"); return NULL; } str[str_len - 1] = c; } num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_IDENTIFIER; new_token->data.constant_kind = CONST_STR; new_token->data.constant_str_value = str; continue; } //* char constant if (p[0] == '\'') { if (!*p++) return NULL; char chr = p[0]; if (p[0] == '\\') { if (!*p++) return NULL; chr = map_escape(p[0]); } if (!*p++) return NULL; if (*p++ != '\'') return fprintf(stderr, "expected '\n"), NULL; num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) return fprintf(stderr, "realloc failed\n"), NULL; struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_IDENTIFIER; new_token->data.constant_kind = CONST_STR; new_token->data.constant_char_value = chr; continue; } //* identifier if (is_ident(p[0]) && !is_numeric(p[0])) { char *ident_start = p; for (char c; (c = *p++) && is_ident(c);) ; p--; int ident_len = p - ident_start - 1; char *ident_str = malloc(ident_len + 1); if (!ident_str) { fprintf(stderr, "malloc failed\n"); return NULL; } for (int i = 0; i < ident_len; i++) ident_str[i] = ident_start[i]; ident_str[ident_len] = '\0'; num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_IDENTIFIER; new_token->data.identifier = ident_str; continue; } fprintf(stderr, "unknown token at %li\n", p - source); return NULL; } num_tokens += 1; tokens = realloc(tokens, sizeof(struct token) * num_tokens); if (!tokens) { fprintf(stderr, "realloc failed\n"); return NULL; } struct token *new_token = &tokens[num_tokens - 1]; new_token->kind = TOK_END; return tokens; } #ifdef DEBUG void debug_tokens(struct token *tokens) { for (int i = 0; tokens[i].kind != TOK_END; i++) { switch (tokens[i].kind) { case TOK_IDENTIFIER: // printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier); printf("TOK_IDENTIFIER, "); break; case TOK_KEYWORD: printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]); break; case TOK_CONSTANT: switch (tokens[i].data.constant_kind) { case CONST_INT: printf("TOK_CONSTANT:CONST_INT:%i, ", tokens[i].data.constant_int_value); break; case CONST_STR: printf("TOK_CONSTANT:CONST_STR:%s, ", tokens[i].data.constant_str_value); break; case CONST_CHAR: printf("TOK_CONSTANT:CONST_CHAR:%c, ", tokens[i].data.constant_char_value); break; } break; case TOK_OPERATOR: printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] ); break; case TOK_SEPERATOR: printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]); break; case TOK_END: break; } } printf("TOK_END\n"); } #endif int main(int argc, char **argv) { if (argc < 3) { fprintf(stderr, "USAGE:\n\tattocc \n"); return 1; } char *input = argv[1]; char *output = argv[2]; int input_fd = open(input, O_RDONLY | O_CLOEXEC); if (input_fd < 0) { perror("cannot open input"); return 1; } int output_fd = open(output, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC, 0640); if (output_fd < 0) { perror("cannot open input"); return 1; } int source_len = 0; char *source = NULL; int size; char buffer[4096]; while ((size = read(input_fd, &buffer, 4096))) { if (size < 0) { perror("cannot read source"); return 1; } source_len += size; source = realloc(source, source_len + 1); if (!source) { fprintf(stderr, "malloc failed\n"); return 1; } for (int i = 0; i < size; i++) source[source_len - size + i] = buffer[i]; } source[source_len] = '\0'; struct token *tokens = tokenize(source); if (!tokens) return 1; #ifdef DEBUG debug_tokens(tokens); #endif free(tokens); return 0; }