WIP: crude syntax parser for a custom language; the name is temporary

This commit is contained in:
isct 2024-06-17 18:56:26 -04:00
commit 6cf9296ac6
2 changed files with 599 additions and 0 deletions

555
lapis.c Normal file
View file

@ -0,0 +1,555 @@
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
/// Discriminator selecting which payload member of a lapis_token_t is meaningful.
enum lapis_ttype {
    LAPIS_TTYPE_NONE,   ///< no token / slot that has been consumed or blanked out
    LAPIS_TTYPE_IDENT,  ///< identifier; payload is len/str
    LAPIS_TTYPE_STR,    ///< string literal; payload is len/str
    LAPIS_TTYPE_FLOAT,  ///< floating point value; payload is f64
    LAPIS_TTYPE_INT,    ///< integer value; payload is i64
    LAPIS_TTYPE_BOOL,   ///< boolean value; payload is bool_
    LAPIS_TTYPE_OPER,   ///< operator; payload is len/str
};

/// A single token: a type tag plus a payload whose active member depends on the tag.
typedef struct {
    enum lapis_ttype type;
    union {
        /// Text payload: `len` bytes at `str` (handled by length, e.g. via fwrite).
        struct {
            size_t len;
            char *str;
        };
        double f64;
        int64_t i64;
        bool bool_;
    };
} lapis_token_t;
/// One node of the expression tree. Nodes live in a flat array and refer to
/// their children by index into that array rather than by pointer.
typedef struct lapis_node_s {
    lapis_token_t value;  ///< the token (operator or operand) this node carries
    size_t lhs_idx;       ///< index of the left child, or SIZE_MAX if there is none
    size_t rhs_idx;       ///< index of the right child, or SIZE_MAX if there is none
} lapis_node_t;
/// Print the expression subtree rooted at nodes[idx] to stdout in infix order.
///
/// The left child (if any) is printed first, then the node's own value, then the
/// right child, separated by single spaces. Operator nodes are wrapped in
/// parentheses. Node types without a printable payload here (NONE, BOOL) emit
/// nothing for their own value.
///
/// @param nodes  node array that `idx` and all child indices refer into
/// @param idx    index of the subtree root; SIZE_MAX (or a NULL array) prints nothing
void lapis_node_print(const lapis_node_t *nodes, size_t idx) {
    // An empty tree is represented by a NULL array and/or the SIZE_MAX sentinel;
    // indexing with either would be out of bounds.
    if ( nodes == NULL || idx == SIZE_MAX ) return;

    const lapis_node_t *node = &nodes[idx];
    const bool is_oper = ( node->value.type == LAPIS_TTYPE_OPER );

    if ( is_oper ) putchar('(');
    if ( node->lhs_idx != SIZE_MAX ) {
        lapis_node_print(nodes, node->lhs_idx);
        putchar(' ');
    }
    switch ( node->value.type ) {
    default:
        break;
    case LAPIS_TTYPE_IDENT:
    case LAPIS_TTYPE_STR:
    case LAPIS_TTYPE_OPER:
        // text payloads are length-delimited, not NUL-terminated
        fwrite(node->value.str, 1, node->value.len, stdout);
        break;
    case LAPIS_TTYPE_FLOAT:
        printf("%lf", node->value.f64);
        break;
    case LAPIS_TTYPE_INT:
        // int64_t is not `long` on every platform; widen explicitly so the
        // conversion specifier always matches the argument type
        printf("%lld", (long long)node->value.i64);
        break;
    }
    if ( node->rhs_idx != SIZE_MAX ) {
        putchar(' ');
        lapis_node_print(nodes, node->rhs_idx);
    }
    if ( is_oper ) putchar(')');
}
/// Free a token's internal text buffer if it has one.
///
/// Identifier, operator and string tokens carry a heap-allocated `str`; every
/// other token type stores its value inline and owns nothing. After the call a
/// text token is left with `str == NULL` and `len == 0`, so calling this twice
/// on the same token is harmless. The token structure itself is not freed.
///
/// @param token  token to release; NULL is ignored
void lapis_token_free(lapis_token_t *token) {
    if ( token == NULL ) return;
    switch ( token->type ) {
    case LAPIS_TTYPE_IDENT:
    case LAPIS_TTYPE_OPER:
    case LAPIS_TTYPE_STR:   // string literals are heap-backed too
        free( token->str ); // free(NULL) is a no-op, no guard needed
        token->str = NULL;
        token->len = 0;
        break;
    default:
        break;
    }
}
lapis_token_t *lapis_parse_tokens(size_t *ntokens, FILE *f) {
size_t num_tokens = 0;
lapis_token_t *tokens = NULL;
lapis_token_t tmptok = { .type = LAPIS_TTYPE_NONE, .str = NULL, .len = 0 };
int c;
while ( (c = fgetc(f)) != EOF ) {
// skip over comments
if ( c == '#' ) {
while ( (c = fgetc(f)) != '\n' && c != EOF );
if ( c == EOF ) break;
}
enum lapis_ttype cur_ttype;
if ( (tmptok.type == LAPIS_TTYPE_IDENT && isdigit(c)) || isalpha(c) || c == '_' ) {
cur_ttype = LAPIS_TTYPE_IDENT;
} else if ( isdigit(c) ) {
cur_ttype = LAPIS_TTYPE_INT;
} else if ( c == '"' ) {
cur_ttype = LAPIS_TTYPE_STR;
} else if ( ispunct(c) ) {
cur_ttype = LAPIS_TTYPE_OPER;
} else if ( isspace(c) ) {
cur_ttype = LAPIS_TTYPE_NONE;
} else {
fprintf(stderr, "ERROR [%s]: Invalid char c='%c' at offset=%ld!\n", __func__, c, ftell(f));
goto ERROR;
}
if ( cur_ttype != tmptok.type ) {
// store the previous token if it was valid
if ( tmptok.type != LAPIS_TTYPE_NONE ) {
tokens = realloc(tokens, (num_tokens + 1) * sizeof(lapis_token_t));
if ( tokens == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tokens[num_tokens] = tmptok;
num_tokens++;
}
// initialize new token
tmptok.type = cur_ttype;
switch (cur_ttype) {
default: break;
case LAPIS_TTYPE_IDENT:
case LAPIS_TTYPE_OPER:
case LAPIS_TTYPE_STR:
tmptok.str = NULL;
tmptok.len = 0;
break;
// NOTE: float and bool will not occur during initial tokenization
// they are only for results of reducing expressions later
//case LAPIS_TTYPE_FLOAT: tmptok.f64 = 0.; break;
//case LAPIS_TTYPE_BOOL: tmptok.bool_ = false; break;
case LAPIS_TTYPE_INT: tmptok.i64 = 0l; break;
}
}
if ( cur_ttype == LAPIS_TTYPE_STR ) {
_Bool in_esc = false;
while ( (c = fgetc(f)) != '"' || in_esc ) {
if ( c == EOF ) {
fprintf(stderr, "ERROR [%s]: EOF inside string token! offset=%ld\n", __func__, ftell(f));
goto ERROR;
}
if ( !in_esc && c == '\\' ) {
in_esc = true;
continue;
} else {
in_esc = false;
}
tmptok.str = realloc(tmptok.str, tmptok.len + 1);
if ( tmptok.str == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tmptok.str[tmptok.len] = c;
tmptok.len++;
}
continue;
}
// extend existing token
switch (cur_ttype) {
default: break;
case LAPIS_TTYPE_IDENT:
case LAPIS_TTYPE_OPER:
tmptok.str = realloc(tmptok.str, tmptok.len + 1);
if ( tmptok.str == NULL ) {
fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
goto ERROR;
}
tmptok.str[tmptok.len] = c;
tmptok.len++;
break;
case LAPIS_TTYPE_INT:
tmptok.i64 *= 10;
tmptok.i64 += (c - '0');
break;
}
}
// handle float decimal point
// FIXME known bug if '.' occurs after an operator w/o a space (e.g. +.234234 fails to parse as a float)
for (size_t n=1; n < num_tokens; n++) {
if ( tokens[n].type == LAPIS_TTYPE_OPER && tokens[n].len == 1 && tokens[n].str[0] == '.' ) {
_Bool valid = false;
double val = 0.;
if ( tokens[n-1].type == LAPIS_TTYPE_INT ) {
val += tokens[n-1].i64;
tokens[n-1].type = LAPIS_TTYPE_NONE;
valid = true;
}
if ( n+1 < num_tokens && tokens[n+1].type == LAPIS_TTYPE_INT ) {
double den = 1;
while ( den < tokens[n+1].i64 ) den *= 10;
val += tokens[n+1].i64 / den;
tokens[n+1].type = LAPIS_TTYPE_NONE;
valid = true;
}
if ( !valid ) continue;
tokens[n].type = LAPIS_TTYPE_FLOAT;
tokens[n].f64 = val;
}
}
// TODO handle sci notation (e.g. 3e[+-]6,3E[+-]6,...)
*ntokens = num_tokens;
return tokens;
ERROR:
if ( tokens ) {
for (size_t n=0; n < num_tokens; n++) lapis_token_free(&tokens[n]);
free(tokens);
}
return NULL;
}
void lapis_parse(FILE *f) {
size_t ntokens=0;
lapis_token_t *tokens = lapis_parse_tokens(&ntokens, stdin);
if ( tokens == NULL ) return;
int64_t *oper_priorities = malloc( ntokens * sizeof(int64_t) );
if ( oper_priorities == NULL ) return;
// 0 - no args, 1 - lhs, 2 - rhs, 3 - both
uint8_t *oper_arg_form = malloc( ntokens * sizeof(uint8_t) );
if ( oper_arg_form == NULL ) return;
memset(oper_arg_form, 0, ntokens);
int64_t paren_level = 0;
for (size_t n=0; n < ntokens; n++) {
if ( tokens[n].type != LAPIS_TTYPE_OPER ) {
oper_priorities[n] = -1;
continue;
}
int64_t prod=1;
for (size_t k=0; k < tokens[n].len; k++) prod *= tokens[n].str[k];
switch (prod) {
default:
fputs("ERROR: unknown priority for operator '", stderr);
fwrite(tokens[n].str, 1, tokens[n].len, stderr);
fputs("'\n", stderr);
return;
case 1ul*',':
oper_priorities[n] = 1;
oper_arg_form[n] = 3;
break;
case 1ul*'=':
case 1ul*'+'*'=':
case 1ul*'-'*'=':
case 1ul*'*'*'=':
case 1ul*'/'*'=':
case 1ul*'%'*'=':
case 1ul*'<'*'<'*'=':
case 1ul*'>'*'>'*'=':
case 1ul*'&'*'=':
case 1ul*'^'*'=':
case 1ul*'|'*'=':
oper_priorities[n] = 2;
oper_arg_form[n] = 3;
break;
//?: (3)
case 1ul*'|'*'|': oper_priorities[n] = 4; oper_arg_form[n] = 3; break;
case 1ul*'&'*'&': oper_priorities[n] = 5; oper_arg_form[n] = 3; break;
case 1ul*'|': oper_priorities[n] = 6; oper_arg_form[n] = 3; break;
case 1ul*'^': oper_priorities[n] = 7; oper_arg_form[n] = 3; break;
case 1ul*'&': oper_priorities[n] = 8; oper_arg_form[n] = 3; break;
case 1ul*'='*'=':
case 1ul*'!'*'=':
oper_priorities[n] = 9;
oper_arg_form[n] = 3;
break;
case 1ul*'<':
case 1ul*'>':
case 1ul*'<'*'=':
case 1ul*'>'*'=':
oper_priorities[n] = 10;
oper_arg_form[n] = 3;
break;
case 1ul*'<'*'<':
case 1ul*'>'*'>':
oper_priorities[n] = 11;
oper_arg_form[n] = 3;
break;
case 1ul*'+':
case 1ul*'-':
// check if actual binary operator or if this is a sign
if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_OPER ) {
oper_arg_form[n] = 2; /*rhs only*/
oper_priorities[n] = 14;
} else {
oper_arg_form[n] = 3; /*binary*/
oper_priorities[n] = 12;
}
break;
case 1ul*'*':
case 1ul*'/':
case 1ul*'%':
oper_priorities[n] = 13;
oper_arg_form[n] = 3;
break;
case 1ul*':':
oper_priorities[n] = 14;
oper_arg_form[n] = 3;
break;
case 1ul*'!':
case 1ul*'~':
oper_priorities[n] = 14;
oper_arg_form[n] = 2;/*rhs only*/
break;
case 1ul*'+'*'+':
case 1ul*'-'*'-':
oper_priorities[n] = 15;
oper_arg_form[n] = 3;
break;
case 1ul*'.':
case 1ul*'-'*'>':
oper_priorities[n] = 15;
oper_arg_form[n] = 3;
break;
case 1ul*'(':
// handle function call / keyword operator
if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_IDENT ) {
// treat the function identifier as a prefix operator
tokens[n-1].type = LAPIS_TTYPE_OPER;
oper_priorities[n-1] = 15 + paren_level * 16;
oper_arg_form[n-1] = 2; // prefix-unary; takes only rhs
}
// clear the '(' as being an operator
oper_priorities[n] = -1;
tokens[n].type = LAPIS_TTYPE_NONE;
paren_level++;
break;
case 1ul*')':
if ( paren_level == 0 ) {
fprintf(stderr, "ERROR(%d): unexpected ')'\n", __LINE__);
return;
}
// clear the ')' as being an operator
oper_priorities[n] = -1;
tokens[n].type = LAPIS_TTYPE_NONE;
paren_level--;
break;
}
if ( oper_priorities[n] != -1 ) oper_priorities[n] += paren_level * 16;
}
size_t nnodes = 0;
lapis_node_t *nodes = NULL;
#define ADD_NODE(NODE) \
(nnodes++, nodes = realloc(nodes, nnodes * sizeof(lapis_node_t)), nodes[nnodes-1] = NODE)
while (1) {
// seek the next highest priority operator
int64_t max_prior = -1;
size_t max_prior_idx = SIZE_MAX;
for (size_t n=0; n < ntokens; n++) {
if ( oper_priorities[n] < 0 ) continue;
if ( oper_priorities[n] > max_prior ) {
max_prior = oper_priorities[n];
max_prior_idx = n;
}
}
// quit if no more operators were found
if ( max_prior_idx == SIZE_MAX ) break;
lapis_node_t node = {
.value = tokens[max_prior_idx],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
size_t update_lhs_idx = SIZE_MAX, update_rhs_idx = SIZE_MAX;
if ( oper_arg_form[max_prior_idx] & 1 ) { /*get lhs if needed*/
size_t n = max_prior_idx;
while ( n > 0 ) {
n--;
if ( tokens[n].type != LAPIS_TTYPE_NONE ) break;
}
if ( n == max_prior_idx || tokens[n].type == LAPIS_TTYPE_NONE ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
if ( tokens[n].type == LAPIS_TTYPE_OPER ) {
if ( oper_priorities[n] >= -1 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
// This is very weird, BUT as a vile hack I encode the node index
// into the negative values of oper_priorities to simultaneously:
// a) clear the priority so the next one can be found
// b) store the index of the new node associated with the operator so
// later lower priority dependent operators can find its node.
int64_t node_idx = -(oper_priorities[n] + 2);
if ( node_idx < 0 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
node.lhs_idx = node_idx;
// since we are taking ownership of this node we must flag it for update
update_lhs_idx = n;
} else {
lapis_node_t tmpnode = {
.value = tokens[n],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
ADD_NODE( tmpnode );
node.lhs_idx = nnodes-1;
}
// clear token type to prevent reuse
tokens[n].type = LAPIS_TTYPE_NONE;
}
if ( oper_arg_form[max_prior_idx] & 2 ) { /*get rhs if needed*/
size_t n = max_prior_idx;
for (; n < ntokens; n++) {
if ( n == max_prior_idx ) continue;
if ( tokens[n].type != LAPIS_TTYPE_NONE ) break;
}
if ( n == max_prior_idx || tokens[n].type == LAPIS_TTYPE_NONE ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
if ( tokens[n].type == LAPIS_TTYPE_OPER ) {
if ( oper_priorities[n] >= -1 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
// This is very weird, BUT as a vile hack I encode the node index
// into the negative values of oper_priorities to simultaneously:
// a) clear the priority so the next one can be found
// b) store the index of the new node associated with the operator so
// later lower priority dependent operators can find its node.
int64_t node_idx = -(oper_priorities[n] + 2);
if ( node_idx < 0 ) {
printf("ERROR: bad logic line #%d\n", __LINE__);
return;
}
node.rhs_idx = node_idx;
// since we are taking ownership of this node we must flag it for update
update_rhs_idx = n;
} else {
lapis_node_t tmpnode = {
.value = tokens[n],
.lhs_idx = SIZE_MAX,
.rhs_idx = SIZE_MAX
};
ADD_NODE( tmpnode );
node.rhs_idx = nnodes-1;
}
// clear token type to prevent reuse
tokens[n].type = LAPIS_TTYPE_NONE;
}
ADD_NODE( node );
int64_t idx = nnodes - 1;
printf("add oper token %lu w/ priority %ld as node %ld\n",
max_prior_idx, oper_priorities[max_prior_idx], idx);
// vile hack again, but on the setting side (see previous comment)
oper_priorities[max_prior_idx] = -idx - 2;
// We also have to update any previous operator nodes to point to our new
// node as well. This is because those nodes will end up double owned otherwise.
if ( update_lhs_idx != SIZE_MAX ) oper_priorities[update_lhs_idx] = -idx - 2;
if ( update_rhs_idx != SIZE_MAX ) oper_priorities[update_rhs_idx] = -idx - 2;
}
//TODO cleanup allocs, error handling, ';' or w/e to **separate expressions**
lapis_node_print(nodes, nnodes-1);
putchar('\n');
}
/// Program entry point: read source text from stdin, parse it and print the
/// resulting expression tree.
///
/// Build with -DLAPIS_DUMP_TOKENS to dump the raw token list instead of
/// parsing, which is useful when debugging the tokenizer on its own.
int main(void) {
#ifdef LAPIS_DUMP_TOKENS
    size_t ntokens = 0;
    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, stdin);
    printf("Tokens (%zu):\n", ntokens);
    for (size_t n = 0; n < ntokens; n++) {
        switch (tokens[n].type) {
        default: break;
        case LAPIS_TTYPE_STR:
        case LAPIS_TTYPE_IDENT:
        case LAPIS_TTYPE_OPER:
            fputs(" ", stdout);
            fwrite(tokens[n].str, 1, tokens[n].len, stdout);
            putchar('\n');
            break;
        case LAPIS_TTYPE_INT:
            // widen explicitly: int64_t is not `long` on every platform
            printf(" %lld (int)\n", (long long)tokens[n].i64);
            break;
        case LAPIS_TTYPE_FLOAT:
            printf(" %lf (float)\n", tokens[n].f64);
            break;
        }
    }
    for (size_t n = 0; n < ntokens; n++) lapis_token_free(&tokens[n]);
    free(tokens);
#else
    lapis_parse(stdin);
#endif
    return 0;
}

44
possible_syntax.lapis Normal file
View file

@ -0,0 +1,44 @@
# Syntax sketch for the language: nothing here is final, it only illustrates
# how declarations, calls and control flow might be written.
# presumably pulls in the module providing print — TODO confirm include() semantics
include("std.print")
# external print implementation ex ##
# Declaration form used throughout: name ( param : type , ... ) -> return_type { body }
print ( fmt : (compexpr | string) , values : any ... ) -> void {
# offset into fmt from which the next "%" placeholder is searched
last_n : int = 0
# visit every passed value, binding each one to `value`
for (values) -> (value : any) {
n : int = str_finds( fmt , "%" , last_n )
static_assert( n != INT_MAX, "Number of format arguments does not match passed values!" )
# emit the literal text between the previous position and this placeholder
putstr( stdout, fmt[last_n : n] )
# dispatch on the value to pick the matching to-string conversion
switch (value) {
default: break
case (int) -> putstr( stdout, to_str_from_int( int(value) ) )
... <fill in rest>
}
# NOTE(review): this leaves last_n on the "%" itself, so the next search
# presumably finds the same placeholder again — confirm whether n + 1 is meant
last_n = n
}
}
######################################
# Example of a user-defined function, followed by calls and statements
test_func ( x : int , y : int ) -> int {
return ( x * y );
};
print( "test_func(%, %) = %" , 2, 3, test_func(2,3) );
# variable declaration with a type annotation and an initial value
x : int = 12345;
if ( x > 0 ) {
print( "as expected; x = %\n" , x )
}
# NOTE(review): statements above end in ';' in some places and not in others;
# whether the terminator is optional is not settled in this sketch
main ( argc : int , argv : string[_] ) -> int {
return (0);
}