/* lang_experiment/lapis.c
 *
 * 684 lines
 * 22 KiB
 * C
 */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
// Forward declaration: lapis_stmt_t needs a pointer to nodes before lapis_node_t is defined.
struct lapis_node_s;
/// One parsed statement: a flat array of expression-tree nodes.
typedef struct {
size_t nnodes; // number of entries in `nodes`
struct lapis_node_s *nodes; // node array; the code in this file treats index nnodes-1 as the root
} lapis_stmt_t;
/// A single token. `type` selects which member of the anonymous union is meaningful.
typedef struct {
enum lapis_ttype {
LAPIS_TTYPE_NONE, // no token: whitespace while lexing, or a slot the parser has cleared
LAPIS_TTYPE_IDENT, // identifier; text in str/len
LAPIS_TTYPE_STR, // string literal contents (quotes stripped); text in str/len
LAPIS_TTYPE_FLOAT, // floating-point value in f64
LAPIS_TTYPE_INT, // integer value in i64
LAPIS_TTYPE_BOOL, // presumably uses bool_; never produced by the code in this file
LAPIS_TTYPE_OPER, // operator; text in str/len
LAPIS_TTYPE_STMT_LIST, // `{ ... }` block; statements in stmts/nstmts
} type;
// The members below share storage; only the one matching `type` may be read.
union {
// LAPIS_TTYPE_STMT_LIST payload
struct {
size_t nstmts;
lapis_stmt_t *stmts;
};
// LAPIS_TTYPE_IDENT / LAPIS_TTYPE_STR / LAPIS_TTYPE_OPER payload.
// The lexer appends raw bytes without a terminating NUL, so always use `len`.
struct {
size_t len;
char * str;
};
double f64;
int64_t i64;
_Bool bool_;
};
} lapis_token_t;
/// Expression-tree node kept in a flat array; children are referenced by array index.
typedef struct lapis_node_s {
lapis_token_t value; // token stored by value (a shallow copy: str/stmts pointers are shared with the source token)
size_t lhs_idx, rhs_idx; // index of the left/right child in the same array, or SIZE_MAX when absent
} lapis_node_t;
void lapis_node_print(const lapis_node_t *nodes, size_t idx) {
if ( nodes[idx].value.type == LAPIS_TTYPE_OPER ) putchar('(');
if ( nodes[idx].lhs_idx != SIZE_MAX ) {
lapis_node_print(nodes, nodes[idx].lhs_idx);
putchar(' ');
}
switch (nodes[idx].value.type){
default: break;
case LAPIS_TTYPE_IDENT:
case LAPIS_TTYPE_STR:
case LAPIS_TTYPE_OPER:
fwrite(nodes[idx].value.str, 1, nodes[idx].value.len, stdout);
break;
case LAPIS_TTYPE_FLOAT:
printf("%lf", nodes[idx].value.f64);
break;
case LAPIS_TTYPE_INT:
printf("%ld", nodes[idx].value.i64);
break;
case LAPIS_TTYPE_STMT_LIST:
for (size_t n=0; n < nodes[idx].value.nstmts; n++) {
lapis_node_print(nodes[idx].value.stmts[n].nodes, nodes[idx].value.stmts[n].nnodes-1);
fputs("; ", stdout);
}
fputs("} ", stdout);
break;
}
if ( nodes[idx].rhs_idx != SIZE_MAX ) {
putchar(' ');
lapis_node_print(nodes, nodes[idx].rhs_idx);
}
if ( nodes[idx].value.type == LAPIS_TTYPE_OPER ) putchar(')');
}
/// Count the nodes reachable from nodes[idx], including nodes[idx] itself.
///
/// @param nodes  flat node array that `idx` and all child indices refer to
/// @param idx    index of the subtree root; must be a valid index into `nodes`
/// @return       1 plus the sizes of the left and right subtrees (SIZE_MAX marks a missing child)
size_t lapis_node_count(const lapis_node_t *nodes, size_t idx) {
    const lapis_node_t *root = &nodes[idx];
    size_t total = 1;

    if ( root->lhs_idx != SIZE_MAX ) {
        total += lapis_node_count(nodes, root->lhs_idx);
    }
    if ( root->rhs_idx != SIZE_MAX ) {
        total += lapis_node_count(nodes, root->rhs_idx);
    }
    return total;
}
/// Free a token's internal text buffer if it has one.
///
/// Identifier, operator and string tokens own a heap buffer in `str`; it is
/// released and `str`/`len` are reset so the call is safe to repeat. String
/// tokens were previously skipped here and leaked. Numeric tokens own nothing.
/// The payload of a LAPIS_TTYPE_STMT_LIST token is NOT released by this function.
///
/// @param token  token to clean up; must not be NULL. The token itself is not freed.
void lapis_token_free(lapis_token_t *token) {
    switch (token->type) {
        case LAPIS_TTYPE_IDENT:
        case LAPIS_TTYPE_OPER:
        case LAPIS_TTYPE_STR:
            free( token->str ); // free(NULL) is a no-op
            token->str = NULL;
            token->len = 0;
            break;
        default:
            break;
    }
}
// Forward declaration: lapis_parse_tokens calls lapis_parse_expr for each statement inside a `{ ... }` block.
lapis_node_t *lapis_parse_expr(size_t *nnodes, FILE *f);
lapis_token_t *lapis_parse_tokens(size_t *ntokens, FILE *f) {
size_t num_tokens = 0;
lapis_token_t *tokens = NULL;
lapis_token_t tmptok = { .type = LAPIS_TTYPE_NONE, .str = NULL, .len = 0 };
_Bool end_of_expression = false;
int c;
while ( !end_of_expression && ((c = fgetc(f)) != EOF) ) {
if ( c == '{' ) {
size_t nstmts = 0;
lapis_stmt_t *stmts = NULL;
while (1) {
//puts("++++++++++");
size_t nnodes;
lapis_node_t *nodes = lapis_parse_expr(&nnodes, f);
//puts("----------");
if ( nodes == 0 ) break;
//printf("%lu: ", nstmts);
//lapis_node_print(nodes, nnodes-1);
//putchar('\n');
nstmts++;
stmts = realloc(stmts, nstmts * sizeof(lapis_stmt_t));
if ( stmts == NULL ) {
fprintf(stderr, "ERROR[%d]: realloc failed!\n", __LINE__);
goto ERROR;
}
stmts[nstmts-1] = (lapis_stmt_t) { .nnodes = nnodes, .nodes = nodes };
}
if ( nstmts > 0 ) {
tokens = realloc(tokens, (num_tokens + 2) * sizeof(lapis_token_t));
if ( tokens == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
// need an operator to connect the statement list to the outer statement
tokens[num_tokens].type = LAPIS_TTYPE_OPER;
tokens[num_tokens].str = strdup("{");
tokens[num_tokens].len = 1;
num_tokens++;
// need an operator to connect the statement list to the outer statement
tokens[num_tokens].type = LAPIS_TTYPE_STMT_LIST;
tokens[num_tokens].stmts = stmts;
tokens[num_tokens].nstmts = nstmts;
num_tokens++;
}
c = ' ';
}
_Bool in_escape = false;
// skip over comments
if ( c == '#' ) {
while ( (c = fgetc(f)) != '\n' && c != EOF );
if ( c == EOF ) break;
}
// detect line escapes
if ( c == '\\' ) {
in_escape = true;
c = fgetc(f);
if ( c == EOF ) break;
}
if ( c == ';' || c == '}' || (!in_escape && c == '\n') ) {
if ( num_tokens || c == '}' ) end_of_expression = true;
c = ' ';
}
enum lapis_ttype cur_ttype;
if ( (tmptok.type == LAPIS_TTYPE_IDENT && isdigit(c)) || isalpha(c) || c == '_' ) {
cur_ttype = LAPIS_TTYPE_IDENT;
} else if ( isdigit(c) ) {
cur_ttype = LAPIS_TTYPE_INT;
} else if ( c == '"' ) {
cur_ttype = LAPIS_TTYPE_STR;
} else if ( ispunct(c) ) {
cur_ttype = LAPIS_TTYPE_OPER;
} else if ( isspace(c) ) {
cur_ttype = LAPIS_TTYPE_NONE;
} else {
fprintf(stderr, "ERROR [%s]: Invalid char c='%c' at offset=%ld!\n", __func__, c, ftell(f));
goto ERROR;
}
if ( cur_ttype != tmptok.type ) {
// store the previous token if it was valid
if ( tmptok.type != LAPIS_TTYPE_NONE ) {
tokens = realloc(tokens, (num_tokens + 1) * sizeof(lapis_token_t));
if ( tokens == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tokens[num_tokens] = tmptok;
num_tokens++;
}
// initialize new token
tmptok.type = cur_ttype;
switch (cur_ttype) {
default: break;
case LAPIS_TTYPE_IDENT:
case LAPIS_TTYPE_OPER:
case LAPIS_TTYPE_STR:
tmptok.str = NULL;
tmptok.len = 0;
break;
// NOTE: float and bool will not occur during initial tokenization
// they are only for results of reducing expressions later
//case LAPIS_TTYPE_FLOAT: tmptok.f64 = 0.; break;
//case LAPIS_TTYPE_BOOL: tmptok.bool_ = false; break;
case LAPIS_TTYPE_INT: tmptok.i64 = 0l; break;
}
}
if ( cur_ttype == LAPIS_TTYPE_STR ) {
_Bool in_esc = false;
while ( (c = fgetc(f)) != '"' || in_esc ) {
if ( c == EOF ) {
fprintf(stderr, "ERROR [%s]: EOF inside string token! offset=%ld\n", __func__, ftell(f));
goto ERROR;
}
if ( !in_esc && c == '\\' ) {
in_esc = true;
continue;
} else {
in_esc = false;
}
tmptok.str = realloc(tmptok.str, tmptok.len + 1);
if ( tmptok.str == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tmptok.str[tmptok.len] = c;
tmptok.len++;
}
continue;
}
// extend existing token
switch (cur_ttype) {
default: break;
case LAPIS_TTYPE_IDENT:
case LAPIS_TTYPE_OPER:
tmptok.str = realloc(tmptok.str, tmptok.len + 1);
if ( tmptok.str == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tmptok.str[tmptok.len] = c;
tmptok.len++;
break;
case LAPIS_TTYPE_INT:
tmptok.i64 *= 10;
tmptok.i64 += (c - '0');
break;
}
}
// handle float decimal point
// FIXME known bug if '.' occurs after an operator w/o a space (e.g. +.234234 fails to parse as a float)
for (size_t n=1; n < num_tokens; n++) {
if ( tokens[n].type == LAPIS_TTYPE_OPER && tokens[n].len == 1 && tokens[n].str[0] == '.' ) {
_Bool valid = false;
double val = 0.;
if ( tokens[n-1].type == LAPIS_TTYPE_INT ) {
val += tokens[n-1].i64;
tokens[n-1].type = LAPIS_TTYPE_NONE;
valid = true;
}
if ( n+1 < num_tokens && tokens[n+1].type == LAPIS_TTYPE_INT ) {
double den = 1;
while ( den < tokens[n+1].i64 ) den *= 10;
val += tokens[n+1].i64 / den;
tokens[n+1].type = LAPIS_TTYPE_NONE;
valid = true;
}
if ( !valid ) continue;
tokens[n].type = LAPIS_TTYPE_FLOAT;
tokens[n].f64 = val;
}
}
// TODO handle sci notation (e.g. 3e[+-]6,3E[+-]6,...)
*ntokens = num_tokens;
return tokens;
ERROR:
if ( tokens ) {
for (size_t n=0; n < num_tokens; n++) lapis_token_free(&tokens[n]);
free(tokens);
}
return NULL;
}
lapis_node_t *lapis_parse_expr(size_t *nnodes, FILE *f) {
lapis_token_t *tokens = NULL;
int64_t *oper_priorities = NULL;
uint8_t *oper_arg_form = NULL;
lapis_node_t *nodes = NULL;
size_t ntokens=0;
tokens = lapis_parse_tokens(&ntokens, stdin);
if ( tokens == NULL ) goto ERROR;
oper_priorities = malloc( ntokens * sizeof(int64_t) );
if ( oper_priorities == NULL ) goto ERROR;
// 0 - no args, 1 - lhs, 2 - rhs, 3 - both
oper_arg_form = malloc( ntokens * sizeof(uint8_t) );
if ( oper_arg_form == NULL ) goto ERROR;
memset(oper_arg_form, 0, ntokens);
size_t num_nodes = 0;
#define ADD_NODE(NODE) \
(num_nodes++, nodes = realloc(nodes, num_nodes * sizeof(lapis_node_t)), nodes[num_nodes-1] = NODE)
int64_t paren_level = 0;
for (size_t n=0; n < ntokens; n++) {
if ( tokens[n].type != LAPIS_TTYPE_OPER ) {
oper_priorities[n] = -1;
continue;
}
int64_t prod=1;
for (size_t k=0; k < tokens[n].len; k++) prod *= tokens[n].str[k];
switch (prod) {
default:
fputs("ERROR: unknown priority for operator '", stderr);
fwrite(tokens[n].str, 1, tokens[n].len, stderr);
fputs("'\n", stderr);
goto ERROR;
case 1ul*',':
case 1ul*'{':
oper_priorities[n] = 1;
oper_arg_form[n] = 3;
break;
case 1ul*'=':
case 1ul*'+'*'=':
case 1ul*'-'*'=':
case 1ul*'*'*'=':
case 1ul*'/'*'=':
case 1ul*'%'*'=':
case 1ul*'<'*'<'*'=':
case 1ul*'>'*'>'*'=':
case 1ul*'&'*'=':
case 1ul*'^'*'=':
case 1ul*'|'*'=':
oper_priorities[n] = 2;
oper_arg_form[n] = 3;
break;
//?: (3)
case 1ul*'|'*'|': oper_priorities[n] = 4; oper_arg_form[n] = 3; break;
case 1ul*'&'*'&': oper_priorities[n] = 5; oper_arg_form[n] = 3; break;
case 1ul*'|': oper_priorities[n] = 6; oper_arg_form[n] = 3; break;
case 1ul*'^': oper_priorities[n] = 7; oper_arg_form[n] = 3; break;
case 1ul*'&': oper_priorities[n] = 8; oper_arg_form[n] = 3; break;
case 1ul*'='*'=':
case 1ul*'!'*'=':
oper_priorities[n] = 9;
oper_arg_form[n] = 3;
break;
case 1ul*'<':
case 1ul*'>':
case 1ul*'<'*'=':
case 1ul*'>'*'=':
oper_priorities[n] = 10;
oper_arg_form[n] = 3;
break;
case 1ul*'<'*'<':
case 1ul*'>'*'>':
oper_priorities[n] = 11;
oper_arg_form[n] = 3;
break;
case 1ul*'+':
case 1ul*'-':
// check if actual binary operator or if this is a sign
if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_OPER ) {
oper_arg_form[n] = 2; /*rhs only*/
oper_priorities[n] = 14;
} else {
oper_arg_form[n] = 3; /*binary*/
oper_priorities[n] = 12;
}
break;
case 1ul*'*':
case 1ul*'/':
case 1ul*'%':
oper_priorities[n] = 13;
oper_arg_form[n] = 3;
break;
case 1ul*':':
oper_priorities[n] = 14;
oper_arg_form[n] = 3;
break;
case 1ul*'!':
case 1ul*'~':
oper_priorities[n] = 14;
oper_arg_form[n] = 2;/*rhs only*/
break;
case 1ul*'+'*'+':
case 1ul*'-'*'-':
oper_priorities[n] = 15;
oper_arg_form[n] = 3;
break;
case 1ul*'.':
case 1ul*'-'*'>':
oper_priorities[n] = 15;
oper_arg_form[n] = 3;
break;
case 1ul*'(':
// handle function call / keyword operator
if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_IDENT ) {
// treat the function identifier as a prefix operator
tokens[n-1].type = LAPIS_TTYPE_OPER;
oper_priorities[n-1] = 15 + paren_level * 16;
oper_arg_form[n-1] = 2; // prefix-unary; takes only rhs
}
// clear the '(' as being an operator
oper_priorities[n] = -1;
tokens[n].type = LAPIS_TTYPE_NONE;
paren_level++;
break;
case 1ul*')':
if ( paren_level == 0 ) {
fprintf(stderr, "ERROR(%d): unexpected ')'\n", __LINE__);
goto ERROR;
}
// clear the ')' as being an operator
oper_priorities[n] = -1;
tokens[n].type = LAPIS_TTYPE_NONE;
paren_level--;
break;
}
if ( oper_priorities[n] != -1 ) oper_priorities[n] += paren_level * 16;
}
while (1) {
// seek the next highest priority operator
int64_t max_prior = -1;
size_t max_prior_idx = SIZE_MAX;
for (size_t n=0; n < ntokens; n++) {
if ( oper_priorities[n] < 0 ) continue;
if ( oper_priorities[n] > max_prior ) {
max_prior = oper_priorities[n];
max_prior_idx = n;
}
}
// quit if no more operators were found
if ( max_prior_idx == SIZE_MAX ) break;
lapis_node_t node = {
.value = tokens[max_prior_idx],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
size_t update_lhs_idx = SIZE_MAX, update_rhs_idx = SIZE_MAX;
if ( oper_arg_form[max_prior_idx] & 1 ) { /*get lhs if needed*/
size_t n = max_prior_idx;
while ( n > 0 ) {
n--;
if ( tokens[n].type != LAPIS_TTYPE_NONE ) break;
}
if ( n == max_prior_idx || tokens[n].type == LAPIS_TTYPE_NONE ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
if ( tokens[n].type == LAPIS_TTYPE_OPER ) {
if ( oper_priorities[n] >= -1 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
// This is very weird, BUT as a vile hack I encode the node index
// into the negative values of oper_priorities to simultaneously:
// a) clear the priority so the next one can be found
// b) store the index of the new node associated with the operator so
// later lower priority dependent operators can find its node.
int64_t node_idx = -(oper_priorities[n] + 2);
if ( node_idx < 0 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
node.lhs_idx = node_idx;
// since we are taking ownership of this node we must flag it for update
update_lhs_idx = n;
} else {
lapis_node_t tmpnode = {
.value = tokens[n],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
ADD_NODE( tmpnode );
node.lhs_idx = num_nodes-1;
}
// clear token type to prevent reuse
tokens[n].type = LAPIS_TTYPE_NONE;
}
if ( oper_arg_form[max_prior_idx] & 2 ) { /*get rhs if needed*/
size_t n = max_prior_idx;
for (; n < ntokens; n++) {
if ( n == max_prior_idx ) continue;
if ( tokens[n].type != LAPIS_TTYPE_NONE ) break;
}
if ( n == max_prior_idx || tokens[n].type == LAPIS_TTYPE_NONE ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
if ( tokens[n].type == LAPIS_TTYPE_OPER ) {
if ( oper_priorities[n] >= -1 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
// This is very weird, BUT as a vile hack I encode the node index
// into the negative values of oper_priorities to simultaneously:
// a) clear the priority so the next one can be found
// b) store the index of the new node associated with the operator so
// later lower priority dependent operators can find its node.
int64_t node_idx = -(oper_priorities[n] + 2);
if ( node_idx < 0 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
goto ERROR;
}
node.rhs_idx = node_idx;
// since we are taking ownership of this node we must flag it for update
update_rhs_idx = n;
} else {
lapis_node_t tmpnode = {
.value = tokens[n],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
ADD_NODE( tmpnode );
node.rhs_idx = num_nodes-1;
}
// clear token type to prevent reuse
tokens[n].type = LAPIS_TTYPE_NONE;
}
ADD_NODE( node );
int64_t idx = num_nodes - 1;
printf("add oper token %lu w/ priority %ld as node %ld\n",
max_prior_idx, oper_priorities[max_prior_idx], idx);
// vile hack again, but on the setting side (see previous comment)
oper_priorities[max_prior_idx] = -idx - 2;
// We also have to update any previous operator nodes to point to our new
// node as well. This is because those nodes will end up double owned otherwise.
if ( update_lhs_idx != SIZE_MAX ) oper_priorities[update_lhs_idx] = -idx - 2;
if ( update_rhs_idx != SIZE_MAX ) oper_priorities[update_rhs_idx] = -idx - 2;
}
if ( lapis_node_count(nodes, num_nodes-1) != num_nodes ) {
// NOTE: this likely indicates a priority conflict of some kind between operators
fprintf(stderr, "ERROR[%d]: Unknown syntax error! Orphan nodes detected!\n", __LINE__);
goto ERROR;
}
if ( tokens != NULL ) free( tokens );
if ( oper_priorities != NULL ) free( oper_priorities );
if ( oper_arg_form != NULL ) free( oper_arg_form );
*nnodes = num_nodes;
return nodes;
ERROR:
if ( tokens != NULL ) free( tokens );
if ( oper_priorities != NULL ) free( oper_priorities );
if ( oper_arg_form != NULL ) free( oper_arg_form );
if ( nodes != NULL ) free( nodes );
*nnodes = 0;
return NULL;
}
/// Read expressions from stdin one at a time and print each parsed tree on
/// its own line; stops at the first call that yields no nodes (end of input
/// or a parse error).
int main(void) {
#if 0
    // debugging aid: dump the raw tokens of a single expression
    size_t ntokens = 0;
    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, stdin);
    printf("Tokens (%zu):\n", ntokens);
    for (size_t n = 0; n < ntokens; n++) {
        switch (tokens[n].type) {
            default: break;
            case LAPIS_TTYPE_STR:
            case LAPIS_TTYPE_IDENT:
            case LAPIS_TTYPE_OPER:
                fputs("  ", stdout);
                fwrite(tokens[n].str, 1, tokens[n].len, stdout);
                putchar('\n');
                break;
            case LAPIS_TTYPE_INT:
                printf("  %lld (int)\n", (long long) tokens[n].i64);
                break;
            case LAPIS_TTYPE_FLOAT:
                printf("  %f (float)\n", tokens[n].f64);
                break;
        }
    }
    free(tokens);
#else
    while (1) {
        size_t nnodes = 0;
        lapis_node_t *nodes = lapis_parse_expr(&nnodes, stdin);
        if ( nodes == NULL || nnodes == 0 ) break;
        // the root of the tree is the last node that was added
        lapis_node_print(nodes, nnodes-1);
        putchar('\n');
        // release the node array (previously leaked on every iteration);
        // text buffers referenced by the nodes are not released here
        free(nodes);
    }
#endif
    return 0;
}