/*
attocc - A minimal C compiler.
Copyright (C) 2025 metamuffin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#ifdef LINT
#define TEST
#define DEBUG
#endif
#define TRAP kill(getpid(), SIGTRAP);
const int NUM_KEYWORDS = 32;
static char *KEYWORDS[] = {
"auto", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "enum", "extern",
"float", "for", "goto", "if", "int", "long",
"register", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "union", "unsigned", "void",
"volatile", "while"};
static char *SEPERATORS = "()[]{};,:";
const int NUM_OPERATORS = 36;
static char *OPERATORS[] = {
"sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=",
">>=", "&=", "|=", "^=", "++", "--", "==", "!=", "<=",
">=", "||", "&&", "+", "-", "*", "/", "%", "<",
">", "~", "&", "|", "^", "!", "=", ".", "?",
};
#ifdef DEBUG
static char *OPERATOR_NAMES[] = {
"OP_SIZEOF",
"OP_SHIFT_LEFT",
"OP_SHIFT_RIGHT",
"OP_ADD_ASSIGN",
"OP_SUB_ASSIGN",
"OP_MUL_ASSIGN",
"OP_DIV_ASSIGN",
"OP_MOD_ASSIGN",
"OP_SHIFT_LEFT_ASSIGN",
"OP_SHIFT_RIGHT_ASSIGN",
"OP_BITWISE_AND_ASSIGN",
"OP_BITWISE_OR_ASSIGN",
"OP_BITWISE_XOR_ASSIGN",
"OP_INCREMENT",
"OP_DECREMENT",
"OP_EQUAL",
"OP_NOT_EQUAL",
"OP_LESS_EQUAL",
"OP_GREATER_EQUAL",
"OP_LOGICAL_OR",
"OP_LOGICAL_AND",
"OP_ADD",
"OP_SUB",
"OP_MUL_OR_POINTER",
"OP_DIV",
"OP_MOD",
"OP_LESS",
"OP_GREATER",
"OP_BITWISE_NOT",
"OP_BITWISE_AND_OR_REF",
"OP_BITWISE_OR",
"OP_BITWISE_XOR",
"OP_LOGICAL_NOT",
"OP_ASSIGN",
"OP_MEMBER_ACCESS",
"OP_TERNARY",
};
static char *KEYWORD_NAMES[] = {
"KW_AUTO", "KW_BREAK", "KW_CASE", "KW_CHAR", "KW_CONST",
"KW_CONTINUE", "KW_DEFAULT", "KW_DO", "KW_DOUBLE", "KW_ELSE",
"KW_ENUM", "KW_EXTERN", "KW_FLOAT", "KW_FOR", "KW_GOTO",
"KW_IF", "KW_INT", "KW_LONG", "KW_REGISTER", "KW_RETURN",
"KW_SHORT", "KW_SIGNED", "KW_SIZEOF", "KW_STATIC", "KW_STRUCT",
"KW_SWITCH", "KW_TYPEDEF", "KW_UNION", "KW_UNSIGNED", "KW_VOID",
"KW_VOLATILE", "KW_WHILE",
};
static char *SEPERATOR_NAMES[] = {
"SEP_LPAREN", "SEP_RPAREN", "SEP_LSQUARE", "SEP_RSQUARE", "SEP_LCURLY",
"SEP_RCURLY", "SEP_SEMICOLON", "SEP_COMMA", "SEP_DOT", "SEP_COLOR",
};
#endif
enum keyword {
KW_AUTO,
KW_BREAK,
KW_CASE,
KW_CHAR,
KW_CONST,
KW_CONTINUE,
KW_DEFAULT,
KW_DO,
KW_DOUBLE,
KW_ELSE,
KW_ENUM,
KW_EXTERN,
KW_FLOAT,
KW_FOR,
KW_GOTO,
KW_IF,
KW_INT,
KW_LONG,
KW_REGISTER,
KW_RETURN,
KW_SHORT,
KW_SIGNED,
KW_SIZEOF,
KW_STATIC,
KW_STRUCT,
KW_SWITCH,
KW_TYPEDEF,
KW_UNION,
KW_UNSIGNED,
KW_VOID,
KW_VOLATILE,
KW_WHILE,
};
enum operator{
OP_SIZEOF, // len=6
OP_SHIFT_LEFT, // len=2
OP_SHIFT_RIGHT,
OP_ADD_ASSIGN,
OP_SUB_ASSIGN,
OP_MUL_ASSIGN,
OP_DIV_ASSIGN,
OP_MOD_ASSIGN,
OP_SHIFT_LEFT_ASSIGN,
OP_SHIFT_RIGHT_ASSIGN,
OP_BITWISE_AND_ASSIGN,
OP_BITWISE_OR_ASSIGN,
OP_BITWISE_XOR_ASSIGN,
OP_INCREMENT,
OP_DECREMENT,
OP_EQUAL,
OP_NOT_EQUAL,
OP_LESS_EQUAL,
OP_GREATER_EQUAL,
OP_LOGICAL_OR,
OP_LOGICAL_AND,
OP_ADD, // len=1
OP_SUB,
OP_MUL_OR_POINTER,
OP_DIV,
OP_MOD,
OP_LESS,
OP_GREATER,
OP_BITWISE_NOT,
OP_BITWISE_AND_OR_REF,
OP_BITWISE_OR,
OP_BITWISE_XOR,
OP_LOGICAL_NOT,
OP_ASSIGN,
OP_MEMBER_ACCESS,
OP_TERNARY,
};
enum seperator {
SEP_LPAREN,
SEP_RPAREN,
SEP_LSQUARE,
SEP_RSQUARE,
SEP_LCURLY,
SEP_RCURLY,
SEP_SEMICOLON,
SEP_COMMA,
SEP_DOT,
SEP_COLOR,
};
enum token_kind {
TOK_END,
TOK_IDENTIFIER,
TOK_KEYWORD,
TOK_CONSTANT,
TOK_SEPERATOR,
TOK_OPERATOR,
TOK_ERROR,
};
enum constant_kind {
CONST_INT,
CONST_STR,
CONST_CHAR,
};
union token_data {
char *identifier;
enum keyword keyword;
struct {
enum constant_kind constant_kind;
union {
int constant_int_value;
char constant_char_value;
char *constant_str_value;
};
};
enum seperator seperator;
enum operator operator;
char *error_message;
};
struct token {
enum token_kind kind;
union token_data data;
unsigned long position;
};
char is_numeric(char c) { return c >= '0' && c <= '9'; }
char is_octal(char c) { return c >= '0' && c <= '7'; }
char is_hexadecial(char c) {
return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f');
}
char is_alpha(char c) {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
char is_alphanumeric(char c) { return is_alpha(c) || is_numeric(c); }
char is_ident(char c) { return is_alphanumeric(c) || c == '_'; }
char map_escape(char c) {
switch (c) {
case 'n':
return '\n';
case 't':
return '\t';
case 'b':
return '\b';
case 'r':
return '\r';
case '\'':
return '\'';
case '"':
return '"';
case '0':
return '\0';
default:
return '\0';
}
}
void *realloc_failsafe(void *old, unsigned long size) {
void *p = realloc(old, size);
if (!p) {
fprintf(stderr, "malloc failed for size %lu\n", size);
exit(2);
}
return p;
}
void *memcpy(void *dest, const void *src, unsigned long size) {
char *dest_ = (char *)dest;
char *src_ = (char *)src;
while (size--)
*dest_++ = *src_++;
return dest;
}
// not to specification but this program only needs to check equality
int strncmp(const char *a, const char *b, unsigned long len) {
while (len--) {
if (*a != *b)
return *a - *b;
a++;
b++;
}
return 0;
}
unsigned long strlen(const char *s) {
int len = 0;
while (*s++)
len++;
return len;
}
char *strdup(const char *s) {
int len = strlen(s);
char *new_s = realloc(NULL, len + 1);
if (!new_s)
return NULL;
memcpy(new_s, s, len + 1);
return new_s;
}
char *strdup_failsafe(char *s) {
char *p = strdup(s);
if (!p) {
fprintf(stderr, "strdup failed\n");
exit(2);
}
return p;
}
struct error {
char *message;
unsigned long position;
};
struct linemap {
unsigned long *lines;
};
struct linemap linemap_new(char *source) {
struct linemap lm;
unsigned long i = 0;
lm.lines = realloc_failsafe(NULL, sizeof(unsigned long));
int num_lines = 1;
lm.lines[0] = 0;
while (source[i]) {
if (source[i] == '\n') {
num_lines += 1;
lm.lines = realloc_failsafe(lm.lines, num_lines * sizeof(unsigned long));
lm.lines[num_lines - 1] = i;
}
i += 1;
}
lm.lines =
realloc_failsafe(lm.lines, (num_lines + 1) * sizeof(unsigned long));
lm.lines[num_lines] = 0xffffffff;
return lm;
}
int linemap_find(struct linemap *lm, unsigned long position) {
int line = 0;
while (lm->lines[line] < position)
line += 1;
return line;
}
void print_error(struct error error, char *filename, struct linemap *linemap) {
int line = linemap_find(linemap, error.position);
int column = error.position - linemap->lines[line];
if (!error.message)
error.message = "";
printf("error: %s\n", error.message);
printf("\tin %s:%i:%i\n", filename, line + 1, column + 1);
}
struct token *token_push(struct token **tokens, unsigned long *num_tokens) {
*num_tokens += 1;
*tokens = realloc_failsafe(*tokens, sizeof(struct token) * *num_tokens);
return &(*tokens)[*num_tokens - 1];
}
struct token_iter {
char *end;
char *start;
char *p;
};
struct token_iter token_iter_new(char *source) {
struct token_iter iter;
iter.end = source + strlen(source);
iter.start = source;
iter.p = source;
return iter;
}
struct token t_error(char *message) {
struct token t;
t.kind = TOK_ERROR;
t.data.error_message = message;
return t;
}
struct token token_iter_next(struct token_iter *iter) {
char *p = iter->p;
unsigned long remaining = iter->end - p;
unsigned long position = p - iter->start;
struct token tok;
tok.kind = TOK_END;
while (*p) {
tok.position = position;
//* whitespace
if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n') {
p++;
continue;
}
//* comments
if (remaining >= 2) {
if (p[0] == '/' && p[1] == '/') {
p += 2;
for (char c; (c = *p++) && c != '\n';)
;
continue;
} else if (p[0] == '/' && p[1] == '*') {
p += 2;
for (char c = '\0', d; (d = c, c = *p++) && !(c == '/' && d == '*');)
;
continue;
} else if (p[0] == '#') {
p += 1;
for (char c; (c = *p++) && c != '\n';)
;
continue;
}
}
//* seperators
char match = 0;
for (int i = 0; SEPERATORS[i]; i++) {
if (p[0] == SEPERATORS[i]) {
p++;
tok.kind = TOK_SEPERATOR;
tok.data.seperator = i;
match = 1;
break;
}
}
if (match)
break;
//* operators
match = 0;
for (int i = 0; i < NUM_OPERATORS; i++) {
char *op = OPERATORS[i];
if (remaining >= strlen(op)) {
if (strncmp(op, p, strlen(op)) == 0) {
p += strlen(op);
tok.kind = TOK_OPERATOR;
tok.data.operator= i;
match = 1;
break;
}
}
}
if (match)
break;
//* keyword
match = 0;
for (int i = 0; i < NUM_KEYWORDS; i++) {
char *kw = KEYWORDS[i];
if (remaining >= strlen(kw) + 1) {
if (strncmp(kw, p, strlen(kw)) == 0 && !is_ident(p[strlen(kw)])) {
p += strlen(kw);
tok.kind = TOK_KEYWORD;
tok.data.keyword = i;
match = 1;
break;
}
}
}
if (match)
break;
//* number constant
if (is_numeric(p[0])) {
int value = 0;
if (remaining >= 2 && p[1] == 'x') {
p += 2;
for (char c; (c = *p++) && is_hexadecial(c);) {
value *= 0x10;
value += c <= '9' ? c - '0' : 10 + (c <= 'F' ? c - 'A' : c - 'a');
}
} else if (p[0] == '0') {
p += 1;
for (char c; (c = *p++) && is_octal(c);) {
value *= 010;
value += c - '0';
}
} else {
for (char c; (c = *p++) && is_numeric(c);) {
value *= 10;
value += c - '0';
}
}
p--;
tok.kind = TOK_CONSTANT;
tok.data.constant_kind = CONST_INT;
tok.data.constant_int_value = value;
break;
}
//* string constant
if (p[0] == '"') {
p++;
char *str = NULL;
int str_len = 0;
for (char c; (c = *p++) && c != '"';) {
// TODO escape
str_len++;
str = realloc_failsafe(str, str_len);
str[str_len - 1] = c;
}
// null termination
str_len++;
str = realloc_failsafe(str, str_len);
str[str_len - 1] = '\0';
tok.kind = TOK_CONSTANT;
tok.data.constant_kind = CONST_STR;
tok.data.constant_str_value = str;
break;
}
//* char constant
if (p[0] == '\'') {
if (!*p++)
return t_error("eof");
char chr = p[0];
if (p[0] == '\\') {
if (!*p++)
return t_error("eof");
chr = map_escape(p[0]);
}
if (!*p++)
return t_error("eof");
if (*p++ != '\'')
return t_error("expected '\n");
tok.kind = TOK_CONSTANT;
tok.data.constant_kind = CONST_CHAR;
tok.data.constant_char_value = chr;
break;
}
//* identifier
if (is_ident(p[0]) && !is_numeric(p[0])) {
char *ident_start = p;
for (char c; (c = *p++) && is_ident(c);)
;
p--;
int ident_len = p - ident_start;
char *ident_str = realloc_failsafe(NULL, ident_len + 1);
for (int i = 0; i < ident_len; i++)
ident_str[i] = ident_start[i];
ident_str[ident_len] = '\0';
tok.kind = TOK_IDENTIFIER;
tok.data.identifier = ident_str;
break;
}
return t_error("unknown token");
}
iter->p = p;
return tok;
}
void token_free(struct token tok) {
switch (tok.kind) {
case TOK_IDENTIFIER:
free(tok.data.identifier);
break;
case TOK_CONSTANT:
switch (tok.data.constant_kind) {
case CONST_STR:
free(tok.data.constant_str_value);
break;
default:
break;
}
default:
break;
}
}
#ifdef DEBUG
void token_print(struct token tok) {
printf("%4lu ", tok.position);
switch (tok.kind) {
case TOK_IDENTIFIER:
printf("TOK_IDENTIFIER:%s", tok.data.identifier);
break;
case TOK_KEYWORD:
printf("TOK_KEYWORD:%s", KEYWORD_NAMES[tok.data.keyword]);
break;
case TOK_CONSTANT:
switch (tok.data.constant_kind) {
case CONST_INT:
printf("TOK_CONSTANT:CONST_INT:%i", tok.data.constant_int_value);
break;
case CONST_STR:
printf("TOK_CONSTANT:CONST_STR:%s", tok.data.constant_str_value);
break;
case CONST_CHAR:
printf("TOK_CONSTANT:CONST_CHAR:%c", tok.data.constant_char_value);
break;
}
break;
case TOK_OPERATOR:
printf("TOK_OPERATOR:%s", OPERATOR_NAMES[tok.data.operator] );
break;
case TOK_SEPERATOR:
printf("TOK_SEPERATOR:%s", SEPERATOR_NAMES[tok.data.seperator]);
break;
case TOK_ERROR:
printf("TOK_ERROR:%s", tok.data.error_message);
break;
case TOK_END:
printf("TOK_END");
break;
}
printf("\n");
}
#endif
int token_step(struct token_iter *iter, struct token *a) {
token_free(*a);
*a = token_iter_next(iter);
return a->kind == TOK_END || a->kind == TOK_ERROR;
}
enum type_kind {
TY_PRIMITIVE,
TY_ENUM,
TY_STRUCT,
TY_UNION,
};
struct struct_decl {};
struct union_decl {};
struct type {
enum type_kind kind;
int index;
};
struct constant {
char *name;
struct type type;
int value;
};
struct global_context {
int num_constants;
struct constant *constants;
int num_enums;
char **enums;
};
int enum_decl(struct global_context *gcx, struct token_iter *iter,
struct token *a, char *name) {
gcx->enums =
realloc_failsafe(gcx->enums, sizeof(char *) * (gcx->num_enums + 1));
int enum_index = gcx->num_enums;
gcx->enums[enum_index] = name;
gcx->num_enums += 1;
int value = 0;
while (1) {
if (a->kind == TOK_IDENTIFIER) {
char *variant_name = strdup_failsafe(a->data.identifier);
if (token_step(iter, a))
return 1;
if (a->kind == TOK_OPERATOR && a->data.operator== OP_ASSIGN) {
if (token_step(iter, a))
return 1;
if (a->kind == TOK_CONSTANT && a->data.constant_kind == CONST_INT) {
value = a->data.constant_int_value;
} else {
fprintf(stderr, "expected int const after enum assign\n");
return 1;
}
} else if (a->kind == TOK_SEPERATOR && a->data.seperator == SEP_COMMA) {
if (token_step(iter, a))
return 1;
continue;
} else {
fprintf(stderr, "expected assign or comma after enum ident\n");
return 1;
}
struct constant con;
con.type.kind = TY_ENUM;
con.type.index = enum_index;
con.value = value;
con.name = variant_name;
gcx->constants = realloc_failsafe(
gcx->constants, sizeof(struct constant) * (gcx->num_constants + 1));
gcx->constants[gcx->num_constants] = con;
gcx->num_constants += 1;
value += 1;
} else if (a->kind == TOK_SEPERATOR && a->data.seperator == SEP_RCURLY) {
if (token_step(iter, a))
return 1;
return 0;
} else {
fprintf(stderr, "expected enum ident or rcurly\n");
return 1;
}
}
}
#ifdef DEBUG
void gcx_print(struct global_context *gcx) {
printf("Constants:\n");
for (int i = 0; i < gcx->num_constants; i++) {
printf("\t%s = %i\n", gcx->constants[i].name, gcx->constants[i].value);
}
}
#endif
int toplevel(struct token_iter *iter) {
struct token a = token_iter_next(iter);
struct global_context gcx;
char *type_name = NULL;
gcx.num_constants = 0;
gcx.constants = NULL;
gcx.num_enums = 0;
gcx.enums = NULL;
if (a.kind == TOK_KEYWORD) {
switch (a.data.keyword) {
case KW_ENUM:
if (token_step(iter, &a))
return 1;
if (a.kind == TOK_IDENTIFIER) {
type_name = strdup_failsafe(a.data.identifier);
if (token_step(iter, &a))
return 1;
}
if (a.kind == TOK_SEPERATOR && a.data.seperator == SEP_LCURLY) {
if (token_step(iter, &a))
return 1;
if (enum_decl(&gcx, iter, &a, type_name))
return 1;
}
break;
default:
printf("unhandled keyword: ");
token_print(a);
return 1;
}
}
#ifdef DEBUG
gcx_print(&gcx);
#endif
return 0;
}
#ifdef TEST
char test_clean = 1;
void assert(char cond, char *label) {
if (!cond) {
fprintf(stderr, "FAIL %s\n", label);
test_clean = 0;
}
}
void assert_eq(int a, int b, char *label) {
if (a != b) {
fprintf(stderr, "FAIL %s (%i == %i)\n", label, a, b);
test_clean = 0;
}
}
void test() {
printf("sizeof(struct token) = %li\n", sizeof(struct token));
printf("sizeof(struct token_iter) = %li\n", sizeof(struct token_iter));
assert_eq(strlen("sizeof"), 6, "strlen 1");
assert_eq(strlen(""), 0, "strlen 2");
assert_eq(strlen("a"), 1, "strlen 3");
assert(is_alpha('a'), "alpha 1");
assert(is_alpha('z'), "alpha 2");
assert(is_alpha('k'), "alpha 3");
// TODO test tokenize
}
#endif
int main(int argc, char **argv) {
#ifdef TEST
test();
return test_clean ? 0 : 1;
#endif
if (argc < 3) {
fprintf(stderr, "USAGE:\n\tattocc