/*
attocc - A minimal C compiler.
Copyright (C) 2024 metamuffin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#include
#ifdef LINT
#define TEST
#define DEBUG
#endif
#define TRAP kill(getpid(), SIGTRAP);
const int NUM_KEYWORDS = 32;
static char *KEYWORDS[] = {
"auto", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "enum", "extern",
"float", "for", "goto", "if", "int", "long",
"register", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "union", "unsigned", "void",
"volatile", "while"};
static char *SEPERATORS = "()[]{};,:";
const int NUM_OPERATORS = 36;
static char *OPERATORS[] = {
"sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=",
">>=", "&=", "|=", "^=", "++", "--", "==", "!=", "<=",
">=", "||", "&&", "+", "-", "*", "/", "%", "<",
">", "~", "&", "|", "^", "!", "=", ".", "?",
};
#ifdef DEBUG
static char *OPERATOR_NAMES[] = {
"OP_SIZEOF",
"OP_SHIFT_LEFT",
"OP_SHIFT_RIGHT",
"OP_ADD_ASSIGN",
"OP_SUB_ASSIGN",
"OP_MUL_ASSIGN",
"OP_DIV_ASSIGN",
"OP_MOD_ASSIGN",
"OP_SHIFT_LEFT_ASSIGN",
"OP_SHIFT_RIGHT_ASSIGN",
"OP_BITWISE_AND_ASSIGN",
"OP_BITWISE_OR_ASSIGN",
"OP_BITWISE_XOR_ASSIGN",
"OP_INCREMENT",
"OP_DECREMENT",
"OP_EQUAL",
"OP_NOT_EQUAL",
"OP_LESS_EQUAL",
"OP_GREATER_EQUAL",
"OP_LOGICAL_OR",
"OP_LOGICAL_AND",
"OP_ADD",
"OP_SUB",
"OP_MUL_OR_POINTER",
"OP_DIV",
"OP_MOD",
"OP_LESS",
"OP_GREATER",
"OP_BITWISE_NOT",
"OP_BITWISE_AND_OR_REF",
"OP_BITWISE_OR",
"OP_BITWISE_XOR",
"OP_LOGICAL_NOT",
"OP_ASSIGN",
"OP_MEMBER_ACCESS",
"OP_TERNARY",
};
static char *KEYWORD_NAMES[] = {
"KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST",
"KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE",
"KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO",
"KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN",
"KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT",
"KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID",
"KW_VOLATILE", "KW_WHILE",
};
static char *SEPERATOR_NAMES[] = {
"SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY",
"SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR",
};
#endif
enum keyword {
KW_AUTO,
KW_BREAK,
KW_CASE,
KW_CHAR,
KW_CONST,
KW_CONTINUE,
KW_DEFAULT,
KW_DO,
KW_DOUBLE,
KW_ELSE,
KW_ENUM,
KW_EXTERN,
KW_FLOAT,
KW_FOR,
KW_GOTO,
KW_IF,
KW_INT,
KW_LONG,
KW_REGISTER,
KW_RETURN,
KW_SHORT,
KW_SIGNED,
KW_SIZEOF,
KW_STATIC,
KW_STRUCT,
KW_SWITCH,
KW_TYPEDEF,
KW_UNION,
KW_UNSIGNED,
KW_VOID,
KW_VOLATILE,
KW_WHILE,
};
enum operator{
OP_SIZEOF, // len=6
OP_SHIFT_LEFT, // len=2
OP_SHIFT_RIGHT,
OP_ADD_ASSIGN,
OP_SUB_ASSIGN,
OP_MUL_ASSIGN,
OP_DIV_ASSIGN,
OP_MOD_ASSIGN,
OP_SHIFT_LEFT_ASSIGN,
OP_SHIFT_RIGHT_ASSIGN,
OP_BITWISE_AND_ASSIGN,
OP_BITWISE_OR_ASSIGN,
OP_BITWISE_XOR_ASSIGN,
OP_INCREMENT,
OP_DECREMENT,
OP_EQUAL,
OP_NOT_EQUAL,
OP_LESS_EQUAL,
OP_GREATER_EQUAL,
OP_LOGICAL_OR,
OP_LOGICAL_AND,
OP_ADD, // len=1
OP_SUB,
OP_MUL_OR_POINTER,
OP_DIV,
OP_MOD,
OP_LESS,
OP_GREATER,
OP_BITWISE_NOT,
OP_BITWISE_AND_OR_REF,
OP_BITWISE_OR,
OP_BITWISE_XOR,
OP_LOGICAL_NOT,
OP_ASSIGN,
OP_MEMBER_ACCESS,
OP_TERNARY,
};
enum seperator {
SEP_LPAREN,
SEP_RPAREN,
SEP_LSQUARE,
SEP_RSQUARE,
SEP_LCURLY,
SEP_RCURLY,
SEP_SEMICOLON,
SEP_COMMA,
SEP_DOT,
SEP_COLOR,
};
enum token_kind {
TOK_END,
TOK_IDENTIFIER,
TOK_KEYWORD,
TOK_CONSTANT,
TOK_SEPERATOR,
TOK_OPERATOR,
};
enum constant_kind {
CONST_INT,
CONST_STR,
CONST_CHAR,
};
union token_data {
char *identifier;
enum keyword keyword;
struct {
enum constant_kind constant_kind;
union {
int constant_int_value;
char constant_char_value;
char *constant_str_value;
};
};
enum seperator seperator;
enum operator operator;
};
struct token {
enum token_kind kind;
union token_data data;
};
char is_numeric(char c) { return c >= '0' && c <= '9'; }
char is_octal(char c) { return c >= '0' && c <= '7'; }
char is_hexadecial(char c) {
return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f');
}
char is_alpha(char c) {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); }
char is_ident(char c) { return is_alphanumeric(c) || c == '_'; }
char map_escape(char c) {
switch (c) {
case 'n':
return '\n';
case 't':
return '\t';
case 'b':
return '\b';
case 'r':
return '\r';
case '\'':
return '\'';
case '"':
return '"';
case '0':
return '\0';
default:
return '\0';
}
}
void *realloc_failsafe(void *old, unsigned long size) {
void *p = realloc(old, size);
if (!p) {
fprintf(stderr, "malloc failed for size %lu\n", size);
exit(2);
}
return p;
}
void *memcpy(void *dest, const void *src, unsigned long size) {
char *dest_ = (char *)dest;
char *src_ = (char *)src;
while (size--)
*dest_++ = *src_++;
return dest;
}
// not to specification but this program only needs to check equality
int strncmp(const char *a, const char *b, unsigned long len) {
while (len--) {
if (*a != *b)
return *a - *b;
a++;
b++;
}
return 0;
}
unsigned long strlen(const char *s) {
int len = 0;
while (*s++)
len++;
return len;
}
char *strdup(const char *s) {
int len = strlen(s);
char *new_s = realloc(NULL, len + 1);
if (!new_s)
return NULL;
memcpy(new_s, s, len + 1);
return new_s;
}
char *strdup_failsafe(char *s) {
char *p = strdup(s);
if (!p) {
fprintf(stderr, "strdup failed\n");
exit(2);
}
return p;
}
struct token *token_push(struct token **tokens, unsigned long *num_tokens) {
*num_tokens += 1;
*tokens = realloc_failsafe(*tokens, sizeof(struct token) * *num_tokens);
return &(*tokens)[*num_tokens - 1];
}
struct token *tokenize(char *source) {
char *end = source + strlen(source);
char *p = source;
unsigned long num_tokens = 0;
struct token *tokens = NULL;
while (*p) {
unsigned long remaining = end - p;
//* whitespace
if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') {
p++;
continue;
}
//* comments
if (remaining >= 2) {
if (p[0] == '/' && p[1] == '/') {
p += 2;
for (char c; (c = *p++) && c != '\n';)
;
continue;
} else if (p[0] == '/' && p[1] == '*') {
p += 2;
for (char c, d = '\0'; (d = c, c = *p++) && !(c == '/' && d == '*');)
;
continue;
} else if (p[0] == '#') {
p += 1;
for (char c; (c = *p++) && c != '\n';)
;
continue;
}
}
//* seperators
char match = 0;
for (int i = 0; SEPERATORS[i]; i++) {
if (p[0] == SEPERATORS[i]) {
p++;
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_SEPERATOR;
new_token->data.seperator = i;
match = 1;
break;
}
}
if (match)
continue;
//* operators
match = 0;
for (int i = 0; i < NUM_OPERATORS; i++) {
char *op = OPERATORS[i];
if (remaining >= strlen(op)) {
if (strncmp(op, p, strlen(op)) == 0) {
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_OPERATOR;
new_token->data.operator= i;
p += strlen(op);
match = 1;
break;
}
}
}
if (match)
continue;
//* keyword
match = 0;
for (int i = 0; i < NUM_KEYWORDS; i++) {
char *kw = KEYWORDS[i];
if (remaining >= strlen(kw) + 1) {
if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) {
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_KEYWORD;
new_token->data.keyword = i;
p += strlen(kw);
match = 1;
break;
}
}
}
if (match)
continue;
//* number constant
if (is_numeric(p[0])) {
int value = 0;
if (remaining >= 2 && p[1] == 'x') {
p += 2;
for (char c; (c = *p++) && is_hexadecial(c);) {
value *= 0x10;
value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a');
}
} else if (p[0] == '0') {
p += 1;
for (char c; (c = *p++) && is_octal(c);) {
value *= 010;
value += c - '0';
}
} else {
for (char c; (c = *p++) && is_numeric(c);) {
value *= 10;
value += c - '0';
}
}
p--;
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_INT;
new_token->data.constant_int_value = value;
continue;
}
//* string constant
if (p[0] == '"') {
p++;
char *str = NULL;
int str_len = 0;
for (char c; (c = *p++) && c != '"';) {
// TODO escape
str_len++;
str = realloc_failsafe(str, str_len);
str[str_len - 1] = c;
}
// null termination
str_len++;
str = realloc_failsafe(str, str_len);
str[str_len - 1] = '\0';
struct token *new_token = token_push(&tokens, &num_tokens);
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_STR;
new_token->data.constant_str_value = str;
continue;
}
//* char constant
if (p[0] == '\'') {
if (!*p++)
return NULL;
char chr = p[0];
if (p[0] == '\\') {
if (!*p++)
return NULL;
chr = map_escape(p[0]);
}
if (!*p++)
return NULL;
if (*p++ != '\'')
return fprintf(stderr, "expected '\n"), NULL;
struct token *new_token = token_push(&tokens, &num_tokens);
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_CHAR;
new_token->data.constant_char_value = chr;
continue;
}
//* identifier
if (is_ident(p[0]) && !is_numeric(p[0])) {
char *ident_start = p;
for (char c; (c = *p++) && is_ident(c);)
;
p--;
int ident_len = p - ident_start;
char *ident_str = realloc_failsafe(NULL, ident_len + 1);
for (int i = 0; i < ident_len; i++)
ident_str[i] = ident_start[i];
ident_str[ident_len] = '\0';
struct token *new_token = token_push(&tokens, &num_tokens);
new_token->kind = TOK_IDENTIFIER;
new_token->data.identifier = ident_str;
continue;
}
fprintf(stderr, "unknown token at %li\n", p - source);
return NULL;
}
struct token *new_token = token_push(&tokens, &num_tokens);
new_token->kind = TOK_END;
return tokens;
}
void free_tokens(struct token *tokens) {
for (int i = 0; tokens[i].kind != TOK_END; i++) {
switch (tokens[i].kind) {
case TOK_IDENTIFIER:
free(tokens[i].data.identifier);
break;
case TOK_CONSTANT:
switch (tokens[i].data.constant_kind) {
case CONST_STR:
free(tokens[i].data.constant_str_value);
break;
default:
break;
}
default:
break;
}
}
free(tokens);
}
#ifdef DEBUG
void debug_tokens(struct token *tokens) {
for (int i = 0; tokens[i].kind != TOK_END; i++) {
switch (tokens[i].kind) {
case TOK_IDENTIFIER:
printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier);
break;
case TOK_KEYWORD:
printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]);
break;
case TOK_CONSTANT:
switch (tokens[i].data.constant_kind) {
case CONST_INT:
printf("TOK_CONSTANT:CONST_INT:%i, ",
tokens[i].data.constant_int_value);
break;
case CONST_STR:
printf("TOK_CONSTANT:CONST_STR:%s, ",
tokens[i].data.constant_str_value);
break;
case CONST_CHAR:
printf("TOK_CONSTANT:CONST_CHAR:%c, ",
tokens[i].data.constant_char_value);
break;
}
break;
case TOK_OPERATOR:
printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] );
break;
case TOK_SEPERATOR:
printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]);
break;
case TOK_END:
break;
}
printf("\n");
}
printf("TOK_END\n");
}
#endif
struct node {
enum node_kind {
NO_BINOP,
NO_UNOP,
NO_MEMBER_ACCESS,
NO_STRUCT,
NO_UNION,
NO_ENUM,
NO_FUNCTION,
NO_COMPOUND,
} kind;
union node_data {
struct node_data_function {
struct node *return_type;
char *name;
int num_arguments;
struct function_argument {
char *name;
struct node *type;
} *arguments;
struct node *body;
} function;
struct node_data_compound {
int num_statements;
struct node **statements;
} compount;
struct node_data_binop {
struct node *lhs;
struct node *rhs;
} binop;
struct node_data_enum {
char *name;
char is_decl;
unsigned long num_members;
struct enum_member {
int value;
char *name;
} *members;
} enum_;
} data;
};
struct node *parse_enum_decl(int *p, struct token *tokens);
struct node *parse_type(int *p, struct token *tokens) {
struct node *out;
out = parse_enum_decl(p, tokens);
if (out)
return out;
return NULL;
}
struct node *parse_function(int *p, struct token *tokens) {
struct node *return_type = parse_type(p, tokens);
if (!return_type)
return NULL;
struct node *node = realloc_failsafe(NULL, sizeof(struct node));
node->kind = NO_FUNCTION;
node->data.function.return_type = return_type;
return node;
}
struct node *parse_enum_decl(int *p, struct token *tokens) {
if (tokens[*p].kind != TOK_KEYWORD || tokens[*p].data.keyword != KW_ENUM)
return 0;
*p += 1;
char *ident = NULL;
if (tokens[*p].kind == TOK_IDENTIFIER) {
ident = tokens[*p].data.identifier;
*p += 1;
}
char is_decl = 0;
struct enum_member *members = NULL;
unsigned long num_members = 0;
if (tokens[*p].kind == TOK_SEPERATOR &&
tokens[*p].data.seperator == SEP_LCURLY) {
*p += 1;
is_decl = 1;
int value = 0;
while (1) {
if (tokens[*p].kind == TOK_SEPERATOR &&
tokens[*p].data.seperator == SEP_RCURLY)
break;
if (tokens[*p].kind == TOK_END)
return NULL;
if (tokens[*p].kind == TOK_SEPERATOR &&
tokens[*p].data.seperator == SEP_COMMA) {
*p += 1;
continue;
}
if (tokens[*p].kind != TOK_IDENTIFIER)
return NULL;
char *m_ident = tokens[*p].data.identifier;
if (tokens[*p].kind == TOK_OPERATOR &&
tokens[*p].data.operator== OP_ASSIGN) {
*p += 1;
if (tokens[*p].kind == TOK_END)
return NULL;
if (tokens[*p].kind != TOK_CONSTANT ||
tokens[*p].data.constant_kind != CONST_INT)
return NULL;
value = tokens[*p].data.constant_int_value;
*p += 1;
}
num_members++;
members =
realloc_failsafe(members, sizeof(struct enum_member) * num_members);
struct enum_member *new_member = &members[num_members - 1];
new_member->value = value++;
new_member->name = strdup_failsafe(m_ident);
*p += 1;
}
} else if (tokens[*p].kind == TOK_SEPERATOR &&
tokens[*p].data.seperator == SEP_SEMICOLON) {
} else {
printf("expected members or semicolon\n");
return NULL;
}
struct node *node = realloc_failsafe(NULL, sizeof(struct node));
node->kind = NO_ENUM;
node->data.enum_.is_decl = is_decl;
node->data.enum_.name = strdup_failsafe(ident);
node->data.enum_.num_members = num_members;
node->data.enum_.members = members;
return node;
}
struct node *parse(int *p, struct token *tokens) {
return parse_type(p, tokens);
}
#ifdef DEBUG
void debug_node(struct node *node) {
switch (node->kind) {
case NO_ENUM:
printf("NO_ENUM\n");
printf(" name=%s\n", node->data.enum_.name);
printf(" is_decl=%i\n", node->data.enum_.is_decl);
if (node->data.enum_.is_decl) {
printf(" num_members=%li\n", node->data.enum_.num_members);
printf(" members\n");
for (int i = 0; i < node->data.enum_.num_members; i++) {
printf(" %s=%i\n", node->data.enum_.members[i].name,
node->data.enum_.members[i].value);
}
}
break;
case NO_BINOP:
printf("NO_BINOP\n");
printf(" lhs: ");
debug_node(node->data.binop.lhs);
printf("\n");
printf(" rhs: ");
debug_node(node->data.binop.rhs);
printf("\n");
break;
case NO_MEMBER_ACCESS:
printf("NO_MEMBER_ACCESS\n");
break;
case NO_FUNCTION:
printf("NO_FUNCTION\n");
break;
case NO_STRUCT:
printf("NO_STRUCT\n");
break;
case NO_UNION:
printf("NO_UNION\n");
break;
case NO_UNOP:
printf("NO_UNOP\n");
break;
default:
break;
}
}
#endif
#ifdef TEST
char test_clean = 1;
void assert(char cond, char *label) {
if (!cond) {
fprintf(stderr, "FAIL %s\n", label);
test_clean = 0;
}
}
void assert_eq(int a, int b, char *label) {
if (a != b) {
fprintf(stderr, "FAIL %s (%i == %i)\n", label, a, b);
test_clean = 0;
}
}
void test() {
assert_eq(strlen("sizeof"), 6, "strlen 1");
assert_eq(strlen(""), 0, "strlen 2");
assert_eq(strlen("a"), 1, "strlen 3");
assert(is_alpha('a'), "alpha 1");
assert(is_alpha('z'), "alpha 2");
assert(is_alpha('k'), "alpha 3");
assert(!!tokenize("enum"), "tok 1");
assert(!!tokenize("int x = 0"), "tok 1");
// assert(!!tokenize("\"Hello\""), "tok 1");
assert(!!tokenize("'\n'"), "tok 1");
}
#endif
int main(int argc, char **argv) {
#ifdef TEST
test();
return test_clean ? 0 : 1;
#endif
if (argc < 3) {
fprintf(stderr, "USAGE:\n\tattocc