/*
attocc - A minimal C compiler.
Copyright (C) 2024 metamuffin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
/*
 * Reserved words recognised by the tokenizer. The position of a word in this
 * table is the value stored in token_data.keyword, so the order mirrors
 * enum keyword.
 */
static char *KEYWORDS[] = {
    "auto",     "break",  "case",    "char",   "const",    "continue",
    "default",  "do",     "double",  "else",   "enum",     "extern",
    "float",    "for",    "goto",    "if",     "int",      "long",
    "register", "return", "short",   "signed", "sizeof",   "static",
    "struct",   "switch", "typedef", "union",  "unsigned", "void",
    "volatile", "while"};
/* Derived from the table so the count cannot drift from its contents. */
const int NUM_KEYWORDS = sizeof KEYWORDS / sizeof KEYWORDS[0];

/*
 * Single-character separators. The index of a character in this string is the
 * value stored in token_data.seperator, so the order mirrors enum seperator:
 * '.' sits at index 8 (SEP_DOT), directly before the colon entry.
 */
static char *SEPERATORS = "()[]{};,.:";

/*
 * Operator spellings. The position of a spelling in this table is the value
 * stored in token_data.operator, so the order mirrors enum operator.
 * "&" and "*" are listed twice (binary and unary meaning); a lookup by
 * spelling alone always reaches the first of each pair.
 */
static char *OPERATORS[] = {
    "sizeof", "<<", ">>", "+=", "-=", "*=", "/=", "%=", "<<=",
    ">>=",    "&=", "|=", "^=", "++", "--", "==", "!=", "<=",
    ">=",     "||", "&&", "+",  "-",  "*",  "/",  "%",  "<",
    ">",      "~",  "&",  "|",  "^",  "!",  "&",  "*",  "=",
};
/* Derived from the table so the count cannot drift from its contents. */
const int NUM_OPERATORS = sizeof OPERATORS / sizeof OPERATORS[0];
/*
 * Keyword identifiers. tokenize() stores the index of the matching KEYWORDS
 * entry in token_data.keyword, so the enumerators are listed in the same
 * order as that table.
 */
enum keyword {
  KW_AUTO,
  kW_AUTO = KW_AUTO, // original misspelling, kept so existing uses still compile
  KW_BREAK,
  KW_CASE,
  KW_CHAR,
  KW_CONST,
  KW_CONTINUE,
  KW_DEFAULT,
  KW_DO,
  KW_DOUBLE,
  KW_ELSE,
  KW_ENUM,
  KW_EXTERN,
  KW_FLOAT,
  KW_FOR,
  KW_GOTO,
  KW_IF,
  KW_INT,
  KW_LONG,
  KW_REGISTER,
  KW_RETURN,
  KW_SHORT,
  KW_SIGNED,
  KW_SIZEOF,
  KW_STATIC,
  KW_STRUCT,
  KW_SWITCH,
  KW_TYPEDEF,
  KW_UNION,
  KW_UNSIGNED,
  KW_VOID,
  KW_VOLATILE,
  KW_WHILE,
};
/*
 * Operator identifiers. tokenize() stores the index of the matching OPERATORS
 * entry in token_data.operator, so the enumerators are listed in the same
 * order as that table. The trailing comments give the spelling found at the
 * same position in OPERATORS.
 */
enum operator{
OP_SIZEOF, // "sizeof"
OP_SHIFT_LEFT, // "<<"
OP_SHIFT_RIGHT, // ">>"
OP_ADD_ASSIGN, // "+="
OP_SUB_ASSIGN, // "-="
OP_MUL_ASSIGN, // "*="
OP_DIV_ASSIGN, // "/="
OP_MOD_ASSIGN, // "%="
OP_SHIFT_LEFT_ASSIGN, // "<<=" (three characters, although it sits among the two-character operators)
OP_SHIFT_RIGHT_ASSIGN, // ">>=" (three characters)
OP_BITWISE_AND_ASSIGN, // "&="
OP_BITWISE_OR_ASSIGN, // "|="
OP_BITWISE_XOR_ASSIGN, // "^="
OP_INCREMENT, // "++"
OP_DECREMENT, // "--"
OP_EQUAL, // "=="
OP_NOT_EQUAL, // "!="
OP_LESS_EQUAL, // "<="
OP_GREATER_EQUAL, // ">="
OP_LOGICAL_OR, // "||"
OP_LOGICAL_AND, // "&&"
OP_ADD, // "+" (single-character operators start here)
OP_SUB, // "-"
OP_MUL, // "*"
OP_DIV, // "/"
OP_MOD, // "%"
OP_LESS, // "<"
OP_GREATER, // ">"
OP_BITWISE_NOT, // "~"
OP_BITWISE_AND, // "&"
OP_BITWISE_OR, // "|"
OP_BITWISE_XOR, // "^"
OP_LOGICAL_NOT, // "!"
OP_POINTER_REF, // "&" again; a table scan by spelling reaches OP_BITWISE_AND first
OP_POINTER_DEREF, // "*" again; a table scan by spelling reaches OP_MUL first
OP_ASSIGN, // "="
};
/*
 * Separator identifiers. tokenize() stores the index of the matching
 * character of SEPERATORS in token_data.seperator.
 * NOTE(review): the enumerator order must match the character order of
 * SEPERATORS; verify the two stay in sync whenever either changes.
 */
enum seperator {
  SEP_LPAREN,
  SEP_RPAREN,
  SEP_LSQUARE,
  SEP_RSQUARE,
  SEP_LCURLY,
  SEP_RCURLY,
  SEP_SEMICOLON,
  SEP_COMMA,
  SEP_DOT,
  SEP_COLON,
  SEP_COLOR = SEP_COLON, // original misspelling, kept so existing uses still compile
};
/*
 * Discriminator for struct token: says which member of union token_data
 * carries the token's payload.
 */
enum token_kind {
TOK_IDENTIFIER, // payload: token_data.identifier
TOK_KEYWORD, // payload: token_data.keyword
TOK_CONSTANT, // presumably pairs with token_data.constant_value; tokenize() does not emit it in the visible code
TOK_SEPERATOR, // payload: token_data.seperator
TOK_OPERATOR, // payload: token_data.operator
TOK_END, // terminator appended after the last real token; carries no payload
};
/*
 * Payload of a token. Only one member is meaningful at a time; struct token's
 * `kind` field says which.
 */
union token_data {
char *identifier; // TOK_IDENTIFIER: NUL-terminated name, heap-allocated by tokenize()
enum keyword keyword; // TOK_KEYWORD: index of the word in KEYWORDS
int constant_value; // presumably the value of a TOK_CONSTANT; never written by tokenize() in the visible code - TODO confirm
enum seperator seperator; // TOK_SEPERATOR: index of the character in SEPERATORS
enum operator operator; // TOK_OPERATOR: index of the spelling in OPERATORS
};
/*
 * One lexical token: a kind tag plus the payload selected by that tag.
 * tokenize() returns an array of these terminated by a TOK_END entry.
 */
struct token {
enum token_kind kind; // selects the active member of `data`
union token_data data; // payload; unused for TOK_END
};
/* Returns 1 when c is one of the decimal digits '0'..'9', otherwise 0. */
char is_numeric(char c) {
  const char lowest = '0';
  const char highest = '9';
  if (c < lowest || highest < c) {
    return 0;
  }
  return 1;
}
/* Returns 1 when c is an unaccented Latin letter (A-Z or a-z), otherwise 0. */
char is_alpha(char c) {
  if ('A' <= c && c <= 'Z') {
    return 1;
  }
  if ('a' <= c && c <= 'z') {
    return 1;
  }
  return 0;
}
/* Returns 1 when c is a letter (per is_alpha) or a digit (per is_numeric), otherwise 0. */
char is_alphanumeric(char c) {
  if (is_alpha(c)) {
    return 1;
  }
  return is_numeric(c) ? 1 : 0;
}
/* Frees a partially built token array together with the identifier strings it owns. */
static void tokenize_discard(struct token *tokens, unsigned long num_tokens) {
  for (unsigned long i = 0; i < num_tokens; i++) {
    if (tokens[i].kind == TOK_IDENTIFIER) {
      free(tokens[i].data.identifier);
    }
  }
  free(tokens);
}

/*
 * Grows the token array by one zero-initialised slot and returns that slot.
 * On allocation failure the array is left untouched (so the caller can still
 * free it) and NULL is returned.
 */
static struct token *tokenize_push(struct token **tokens,
                                   unsigned long *num_tokens) {
  // Go through a temporary: assigning realloc's result straight to *tokens
  // would lose the only pointer to the old array when it returns NULL.
  struct token *grown =
      realloc(*tokens, sizeof(struct token) * (*num_tokens + 1));
  if (!grown) {
    fprintf(stderr, "realloc failed\n");
    return NULL;
  }
  *tokens = grown;
  struct token *slot = &grown[*num_tokens];
  *num_tokens += 1;
  *slot = (struct token){0};
  return slot;
}

/* Returns the index of the table entry spelled exactly like word[0..word_len), or -1. */
static int tokenize_lookup(char *table[], int count, const char *word,
                           size_t word_len) {
  for (int i = 0; i < count; i++) {
    if (strlen(table[i]) == word_len && strncmp(table[i], word, word_len) == 0) {
      return i;
    }
  }
  return -1;
}

/*
 * Splits a NUL-terminated C source string into tokens.
 *
 * Skips whitespace, `//` and block comments, and every line from a '#' to the
 * end of that line. Recognises the characters in SEPERATORS, the spellings in
 * OPERATORS (longest spelling wins, so "<<=" is one token), the words in
 * KEYWORDS, and identifiers made of letters, digits and '_'. A word is matched
 * as a whole, so "double" is a keyword and "integer" is an identifier; the
 * word "sizeof" is reported as TOK_OPERATOR because OPERATORS is consulted
 * before KEYWORDS. Numeric constants are not recognised and end up in the
 * unknown-token path.
 *
 * source: text to tokenize; it is only read.
 *
 * Returns a heap-allocated array terminated by a TOK_END token. The caller
 * owns the array and the `identifier` string of every TOK_IDENTIFIER token.
 * Returns NULL after printing a diagnostic to stderr when an allocation fails
 * or the input contains something that is not a token; everything allocated
 * so far is released in that case.
 */
struct token *tokenize(char *source) {
  char *p = source;
  unsigned long num_tokens = 0;
  struct token *tokens = NULL;
  struct token *new_token = NULL;

  while (*p) {
    //* whitespace
    if (p[0] == ' ' || p[0] == '\t' || p[0] == '\n' || p[0] == '\r') {
      p++;
      continue;
    }

    //* comments and preprocessor lines
    // Checked before operators, otherwise "//" and "/*" would be consumed as
    // '/' and '*' operator tokens. p[1] is readable because p[0] is not NUL.
    if (p[0] == '#' || (p[0] == '/' && p[1] == '/')) {
      // Stop ON the newline or terminator, never past it.
      while (*p && *p != '\n') {
        p++;
      }
      continue;
    }
    if (p[0] == '/' && p[1] == '*') {
      char *comment_start = p;
      p += 2;
      while (*p && !(p[0] == '*' && p[1] == '/')) {
        p++;
      }
      if (!*p) {
        fprintf(stderr, "unterminated comment at %li\n",
                (long)(comment_start - source));
        goto fail;
      }
      p += 2;
      continue;
    }

    //* seperators
    // p[0] is not NUL here, so strchr cannot match the string's terminator.
    char *sep = strchr(SEPERATORS, p[0]);
    if (sep) {
      new_token = tokenize_push(&tokens, &num_tokens);
      if (!new_token) {
        goto fail;
      }
      new_token->kind = TOK_SEPERATOR;
      new_token->data.seperator = (int)(sep - SEPERATORS);
      p++;
      continue;
    }

    //* words: "sizeof", keywords and identifiers
    if (is_alpha(p[0]) || p[0] == '_') {
      char *word = p;
      // Scan the whole word first so table entries are never matched as a
      // mere prefix of a longer identifier.
      while (is_alphanumeric(*p) || *p == '_') {
        p++;
      }
      size_t word_len = (size_t)(p - word);

      new_token = tokenize_push(&tokens, &num_tokens);
      if (!new_token) {
        goto fail;
      }
      int index = tokenize_lookup(OPERATORS, NUM_OPERATORS, word, word_len);
      if (index >= 0) {
        new_token->kind = TOK_OPERATOR;
        new_token->data.operator = index;
        continue;
      }
      index = tokenize_lookup(KEYWORDS, NUM_KEYWORDS, word, word_len);
      if (index >= 0) {
        new_token->kind = TOK_KEYWORD;
        new_token->data.keyword = index;
        continue;
      }
      // The slot is already an identifier with a NULL name, so the failure
      // path below can free it safely if malloc fails.
      new_token->kind = TOK_IDENTIFIER;
      char *ident_str = malloc(word_len + 1);
      if (!ident_str) {
        fprintf(stderr, "malloc failed\n");
        goto fail;
      }
      memcpy(ident_str, word, word_len);
      ident_str[word_len] = '\0';
      new_token->data.identifier = ident_str;
      continue;
    }

    //* operators
    // Longest match; on equal length the earliest table entry wins, which
    // keeps the first "&" / "*" entries as the ones reported.
    int best = -1;
    size_t best_len = 0;
    for (int i = 0; i < NUM_OPERATORS; i++) {
      size_t op_len = strlen(OPERATORS[i]);
      if (op_len > best_len && strncmp(OPERATORS[i], p, op_len) == 0) {
        best = i;
        best_len = op_len;
      }
    }
    if (best >= 0) {
      new_token = tokenize_push(&tokens, &num_tokens);
      if (!new_token) {
        goto fail;
      }
      new_token->kind = TOK_OPERATOR;
      new_token->data.operator = best;
      p += best_len;
      continue;
    }

    fprintf(stderr, "unknown token at %li\n", (long)(p - source));
    printf("%s", p);
    goto fail;
  }

  new_token = tokenize_push(&tokens, &num_tokens);
  if (!new_token) {
    goto fail;
  }
  new_token->kind = TOK_END;
  return tokens;

fail:
  tokenize_discard(tokens, num_tokens);
  return NULL;
}
void debug_tokens(struct token *tokens) {
for (int i = 0; tokens[i].kind != TOK_END; i++) {
switch (tokens[i].kind) {
case TOK_IDENTIFIER:
printf("TOK_IDENTIFIER, ");
break;
case TOK_KEYWORD:
printf("TOK_KEYWORD, ");
break;
case TOK_CONSTANT:
printf("TOK_CONSTANT, ");
break;
case TOK_OPERATOR:
printf("TOK_OPERATOR, ");
break;
case TOK_SEPERATOR:
printf("TOK_SEPERATOR, ");
break;
case TOK_END:
break;
}
}
printf("TOK_END\n");
}
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "USAGE:\n\tattocc