/*
attocc - A minimal C compiler.
Copyright (C) 2024 metamuffin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
const int NUM_KEYWORDS = 32;
static char *KEYWORDS[] = {
"auto", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "enum", "extern",
"float", "for", "goto", "if", "int", "long",
"register", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "union", "unsigned", "void",
"volatile", "while"};
static char *SEPERATORS = "()[]{};,:";
const int NUM_OPERATORS = 36;
static char *OPERATORS[] = {
"sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=",
">>=", "&=", "|=", "^=", "++", "--", "==", "!=", "<=",
">=", "||", "&&", "+", "-", "*", "/", "%", "<",
">", "~", "&", "|", "^", "!", "=", ".", "?",
};
#ifdef DEBUG
static char *OPERATOR_NAMES[] = {
"OP_SIZEOF",
"OP_SHIFT_LEFT",
"OP_SHIFT_RIGHT",
"OP_ADD_ASSIGN",
"OP_SUB_ASSIGN",
"OP_MUL_ASSIGN",
"OP_DIV_ASSIGN",
"OP_MOD_ASSIGN",
"OP_SHIFT_LEFT_ASSIGN",
"OP_SHIFT_RIGHT_ASSIGN",
"OP_BITWISE_AND_ASSIGN",
"OP_BITWISE_OR_ASSIGN",
"OP_BITWISE_XOR_ASSIGN",
"OP_INCREMENT",
"OP_DECREMENT",
"OP_EQUAL",
"OP_NOT_EQUAL",
"OP_LESS_EQUAL",
"OP_GREATER_EQUAL",
"OP_LOGICAL_OR",
"OP_LOGICAL_AND",
"OP_ADD",
"OP_SUB",
"OP_MUL_OR_POINTER",
"OP_DIV",
"OP_MOD",
"OP_LESS",
"OP_GREATER",
"OP_BITWISE_NOT",
"OP_BITWISE_AND_OR_REF",
"OP_BITWISE_OR",
"OP_BITWISE_XOR",
"OP_LOGICAL_NOT",
"OP_ASSIGN",
"OP_MEMBER_ACCESS",
"OP_TERNARY",
};
static char *KEYWORD_NAMES[] = {
"KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST",
"KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE",
"KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO",
"KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN",
"KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT",
"KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID",
"KW_VOLATILE", "KW_WHILE",
};
static char *SEPERATOR_NAMES[] = {
"SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY",
"SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR",
};
#endif
enum keyword {
KW_AUTO,
KW_BREAK,
KW_CASE,
KW_CHAR,
KW_CONST,
KW_CONTINUE,
KW_DEFAULT,
KW_DO,
KW_DOUBLE,
KW_ELSE,
KW_ENUM,
KW_EXTERN,
KW_FLOAT,
KW_FOR,
KW_GOTO,
KW_IF,
KW_INT,
KW_LONG,
KW_REGISTER,
KW_RETURN,
KW_SHORT,
KW_SIGNED,
KW_SIZEOF,
KW_STATIC,
KW_STRUCT,
KW_SWITCH,
KW_TYPEDEF,
KW_UNION,
KW_UNSIGNED,
KW_VOID,
KW_VOLATILE,
KW_WHILE,
};
enum operator{
OP_SIZEOF, // len=6
OP_SHIFT_LEFT, // len=2
OP_SHIFT_RIGHT,
OP_ADD_ASSIGN,
OP_SUB_ASSIGN,
OP_MUL_ASSIGN,
OP_DIV_ASSIGN,
OP_MOD_ASSIGN,
OP_SHIFT_LEFT_ASSIGN,
OP_SHIFT_RIGHT_ASSIGN,
OP_BITWISE_AND_ASSIGN,
OP_BITWISE_OR_ASSIGN,
OP_BITWISE_XOR_ASSIGN,
OP_INCREMENT,
OP_DECREMENT,
OP_EQUAL,
OP_NOT_EQUAL,
OP_LESS_EQUAL,
OP_GREATER_EQUAL,
OP_LOGICAL_OR,
OP_LOGICAL_AND,
OP_ADD, // len=1
OP_SUB,
OP_MUL_OR_POINTER,
OP_DIV,
OP_MOD,
OP_LESS,
OP_GREATER,
OP_BITWISE_NOT,
OP_BITWISE_AND_OR_REF,
OP_BITWISE_OR,
OP_BITWISE_XOR,
OP_LOGICAL_NOT,
OP_ASSIGN,
OP_MEMBER_ACCESS,
OP_TERNARY,
};
enum seperator {
SEP_LPAREN,
SEP_RPAREN,
SEP_LSQUARE,
SEP_RSQUARE,
SEP_LCURLY,
SEP_RCURLY,
SEP_SEMICOLON,
SEP_COMMA,
SEP_DOT,
SEP_COLOR,
};
enum token_kind {
TOK_IDENTIFIER,
TOK_KEYWORD,
TOK_CONSTANT,
TOK_SEPERATOR,
TOK_OPERATOR,
TOK_END,
};
enum constant_kind {
CONST_INT,
CONST_STR,
CONST_CHAR,
};
union token_data {
char *identifier;
enum keyword keyword;
struct {
enum constant_kind constant_kind;
union {
int constant_int_value;
char constant_char_value;
char *constant_str_value;
};
};
enum seperator seperator;
enum operator operator;
};
struct token {
enum token_kind kind;
union token_data data;
};
char is_numeric(char c) { return c >= '0' && c <= '9'; }
char is_octal(char c) { return c >= '0' && c <= '7'; }
char is_hexadecial(char c) {
return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f');
}
char is_alpha(char c) {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); }
char is_ident(char c) { return is_alphanumeric(c) || c == '_'; }
char map_escape(char c) {
switch (c) {
case 'n':
return '\n';
case 't':
return '\t';
case 'b':
return '\b';
case 'r':
return '\r';
case '\'':
return '\'';
case '"':
return '"';
case '0':
return '\0';
default:
return '\0';
}
}
struct token *token_push(struct token **tokens, unsigned long *num_tokens) {
*num_tokens += 1;
*tokens = realloc(*tokens, sizeof(struct token) * *num_tokens);
if (!tokens) {
fprintf(stderr, "realloc failed\n");
return NULL;
}
return &(*tokens)[*num_tokens - 1];
}
struct token *tokenize(char *source) {
char *end = source + strlen(source);
char *p = source;
unsigned long num_tokens = 0;
struct token *tokens = NULL;
while (*p) {
unsigned long remaining = end - p;
//* whitespace
if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') {
p++;
continue;
}
//* comments
if (remaining >= 2) {
if (p[0] == '/' && p[1] == '/') {
p += 2;
for (char c; (c = *p++) && c != '\n';)
;
continue;
} else if (p[0] == '/' && p[1] == '*') {
p += 2;
for (char c, d = '\0'; (d = c, c = *p++) && !(c == '/' && d == '*');)
;
continue;
} else if (p[0] == '#') {
p += 1;
for (char c; (c = *p++) && c != '\n';)
;
continue;
}
}
//* seperators
char match = 0;
for (int i = 0; SEPERATORS[i]; i++) {
if (p[0] == SEPERATORS[i]) {
p++;
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_SEPERATOR;
new_token->data.seperator = i;
match = 1;
break;
}
}
if (match)
continue;
//* operators
match = 0;
for (int i = 0; i < NUM_OPERATORS; i++) {
char *op = OPERATORS[i];
if (remaining >= strlen(op)) {
if (strncmp(op, p, strlen(op)) == 0) {
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_OPERATOR;
new_token->data.operator= i;
p += strlen(op);
match = 1;
break;
}
}
}
if (match)
continue;
//* keyword
match = 0;
for (int i = 0; i < NUM_KEYWORDS; i++) {
char *kw = KEYWORDS[i];
if (remaining >= strlen(kw) + 1) {
if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) {
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_KEYWORD;
new_token->data.keyword = i;
p += strlen(kw);
match = 1;
break;
}
}
}
if (match)
continue;
//* number constant
if (is_numeric(p[0])) {
int value = 0;
if (remaining >= 2 && p[1] == 'x') {
p += 2;
for (char c; (c = *p++) && is_hexadecial(c);) {
value *= 0x10;
value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a');
}
} else if (p[0] == '0') {
p += 1;
for (char c; (c = *p++) && is_octal(c);) {
value *= 010;
value += c - '0';
}
} else {
for (char c; (c = *p++) && is_numeric(c);) {
value *= 10;
value += c - '0';
}
}
p--;
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_INT;
new_token->data.constant_int_value = value;
continue;
}
//* string constant
if (p[0] == '"') {
p++;
char *str = NULL;
int str_len = 0;
for (char c; (c = *p++) && c != '"';) {
// TODO escape
str_len++;
str = realloc(str, str_len);
if (!str) {
fprintf(stderr, "realloc failed\n");
return NULL;
}
str[str_len - 1] = c;
}
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_STR;
new_token->data.constant_str_value = str;
continue;
}
//* char constant
if (p[0] == '\'') {
if (!*p++)
return NULL;
char chr = p[0];
if (p[0] == '\\') {
if (!*p++)
return NULL;
chr = map_escape(p[0]);
}
if (!*p++)
return NULL;
if (*p++ != '\'')
return fprintf(stderr, "expected '\n"), NULL;
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_CONSTANT;
new_token->data.constant_kind = CONST_CHAR;
new_token->data.constant_char_value = chr;
continue;
}
//* identifier
if (is_ident(p[0]) && !is_numeric(p[0])) {
char *ident_start = p;
for (char c; (c = *p++) && is_ident(c);)
;
int ident_len = p - ident_start - 1;
char *ident_str = malloc(ident_len + 1);
if (!ident_str) {
fprintf(stderr, "malloc failed\n");
return NULL;
}
for (int i = 0; i < ident_len; i++)
ident_str[i] = ident_start[i];
ident_str[ident_len] = '\0';
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_IDENTIFIER;
new_token->data.identifier = ident_str;
continue;
}
fprintf(stderr, "unknown token at %li\n", p - source);
return NULL;
}
struct token *new_token = token_push(&tokens, &num_tokens);
if (!new_token)
return NULL;
new_token->kind = TOK_END;
return tokens;
}
void free_tokens(struct token *tokens) {
for (int i = 0; tokens[i].kind != TOK_END; i++) {
switch (tokens[i].kind) {
case TOK_IDENTIFIER:
free(tokens[i].data.identifier);
break;
case TOK_CONSTANT:
switch (tokens[i].data.constant_kind) {
case CONST_STR:
free(tokens[i].data.constant_str_value);
break;
default:
break;
}
default:
break;
}
}
free(tokens);
}
#ifdef DEBUG
void debug_tokens(struct token *tokens) {
for (int i = 0; tokens[i].kind != TOK_END; i++) {
switch (tokens[i].kind) {
case TOK_IDENTIFIER:
if (tokens[i].data.identifier < 100)
printf("TOK_IDENTIFIER:%p, ", tokens[i].data.identifier);
else
printf("TOK_IDENTIFIER:%s, ", tokens[i].data.identifier);
break;
case TOK_KEYWORD:
printf("TOK_KEYWORD:%s, ", KEYWORD_NAMES[tokens[i].data.keyword]);
break;
case TOK_CONSTANT:
switch (tokens[i].data.constant_kind) {
case CONST_INT:
printf("TOK_CONSTANT:CONST_INT:%i, ",
tokens[i].data.constant_int_value);
break;
case CONST_STR:
printf("TOK_CONSTANT:CONST_STR:%s, ",
tokens[i].data.constant_str_value);
break;
case CONST_CHAR:
printf("TOK_CONSTANT:CONST_CHAR:%c, ",
tokens[i].data.constant_char_value);
break;
}
break;
case TOK_OPERATOR:
printf("TOK_OPERATOR:%s, ", OPERATOR_NAMES[tokens[i].data.operator] );
break;
case TOK_SEPERATOR:
printf("TOK_SEPERATOR:%s, ", SEPERATOR_NAMES[tokens[i].data.seperator]);
break;
case TOK_END:
break;
}
printf("\n");
}
printf("TOK_END\n");
}
#endif
enum node_kind {
NO_BINOP,
NO_UNOP,
NO_MEMBER_ACCESS,
NO_STRUCT,
NO_UNION,
NO_ENUM,
NO_FUNCTION
};
union node_data {
struct {
struct expr *lhs;
struct expr *rhs;
} binop;
};
struct node {
enum node_kind kind;
union node_data data;
};
char parse_enum_decl(struct token *tokens, struct node **node) {
int p = 0;
if (tokens[p].kind != TOK_KEYWORD || tokens[p].data.keyword != KW_ENUM)
return 1;
p++;
char *ident = NULL;
if (tokens[p].kind == TOK_IDENTIFIER) {
ident = tokens[p].data.identifier;
p++;
}
if (tokens[p].kind != TOK_SEPERATOR || tokens[p].data.seperator != SEP_LCURLY)
return 1;
p++;
*node = malloc(sizeof(struct node));
if (!*node) {
fprintf(stderr, "malloc failed\n");
return 1;
}
// (**node).data.;
return 0;
}
struct decl *parse(struct token *tokens) { return NULL; }
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "USAGE:\n\tattocc