commit 6cf9296ac6f8a752769d2a417df93622f928699a
Author: isct
Date:   Mon Jun 17 18:56:26 2024 -0400

    wip crude syntax parser for custom language; name is temp only

diff --git a/lapis.c b/lapis.c
new file mode 100644
index 0000000..c3b55ca
--- /dev/null
+++ b/lapis.c
@@ -0,0 +1,686 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <ctype.h>
+
+/// A lexical token; also the payload of a syntax tree node.
+/// IDENT, STR and OPER tokens own the heap buffer `str` holding `len` bytes
+/// (not nul-terminated); INT, FLOAT and BOOL tokens store their value inline.
+typedef struct {
+    enum lapis_ttype {
+        LAPIS_TTYPE_NONE,
+        LAPIS_TTYPE_IDENT,
+        LAPIS_TTYPE_STR,
+        LAPIS_TTYPE_FLOAT,
+        LAPIS_TTYPE_INT,
+        LAPIS_TTYPE_BOOL,
+        LAPIS_TTYPE_OPER,
+    } type;
+
+    union {
+        struct {
+            size_t len;
+            char * str;
+        };
+        double f64;
+        int64_t i64;
+        _Bool bool_;
+    };
+
+    /// Digits read for an INT token; lets a fraction such as ".05" keep its
+    /// leading zero when it is merged into a float.
+    size_t ndigits;
+} lapis_token_t;
+
+/// A syntax tree node. Children are indices into the node array;
+/// SIZE_MAX marks a missing child.
+typedef struct lapis_node_s {
+    lapis_token_t value;
+    size_t lhs_idx, rhs_idx;
+} lapis_node_t;
+
+/// Print the subtree rooted at nodes[idx] to stdout in infix order.
+/// Operator nodes are wrapped in parentheses. Prints nothing when there is no tree.
+void lapis_node_print(const lapis_node_t *nodes, size_t idx) {
+
+    if ( nodes == NULL || idx == SIZE_MAX ) return;
+
+    const lapis_node_t *node = &nodes[idx];
+    const _Bool is_oper = ( node->value.type == LAPIS_TTYPE_OPER );
+
+    if ( is_oper ) putchar('(');
+
+    if ( node->lhs_idx != SIZE_MAX ) {
+        lapis_node_print(nodes, node->lhs_idx);
+        putchar(' ');
+    }
+
+    switch (node->value.type) {
+        default: break;
+        case LAPIS_TTYPE_IDENT:
+        case LAPIS_TTYPE_STR:
+        case LAPIS_TTYPE_OPER:
+            // an empty string token has no buffer at all
+            if ( node->value.len > 0 ) fwrite(node->value.str, 1, node->value.len, stdout);
+            break;
+        case LAPIS_TTYPE_FLOAT:
+            printf("%lf", node->value.f64);
+            break;
+        case LAPIS_TTYPE_INT:
+            // int64_t is not 'long' on every platform, so go through long long
+            printf("%lld", (long long)node->value.i64);
+            break;
+        case LAPIS_TTYPE_BOOL:
+            fputs(node->value.bool_ ? "true" : "false", stdout);
+            break;
+    }
+
+    if ( node->rhs_idx != SIZE_MAX ) {
+        putchar(' ');
+        lapis_node_print(nodes, node->rhs_idx);
+    }
+
+    if ( is_oper ) putchar(')');
+}
+
+/// Free a token's internal buffer if it has one.
+/// Only IDENT, STR and OPER tokens own a buffer; other types are left untouched.
+void lapis_token_free(lapis_token_t *token) {
+    if ( token == NULL ) return;
+    switch (token->type) {
+        default: break;
+        case LAPIS_TTYPE_IDENT:
+        case LAPIS_TTYPE_STR:
+        case LAPIS_TTYPE_OPER:
+            free( token->str );
+            token->str = NULL;
+            token->len = 0;
+            break;
+    }
+}
+
+/// Append one byte to a token's string buffer.
+/// Returns false and leaves the buffer intact when the allocation fails.
+static _Bool lapis_token_append(lapis_token_t *token, char ch) {
+    char *grown = realloc(token->str, token->len + 1);
+    if ( grown == NULL ) return false;
+    grown[token->len] = ch;
+    token->str = grown;
+    token->len++;
+    return true;
+}
+
+/// Append a copy of `token` to the token array.
+/// Returns false and leaves the array intact when the allocation fails.
+static _Bool lapis_tokens_push(lapis_token_t **tokens, size_t *num_tokens, const lapis_token_t *token) {
+    lapis_token_t *grown = realloc(*tokens, (*num_tokens + 1) * sizeof(lapis_token_t));
+    if ( grown == NULL ) return false;
+    grown[*num_tokens] = *token;
+    *tokens = grown;
+    (*num_tokens)++;
+    return true;
+}
+
+/// Split the stream `f` into tokens.
+///
+/// '#' starts a comment that runs to the end of the line. Runs of letters,
+/// digits and '_' form identifiers, runs of digits form integers, runs of
+/// punctuation form a single operator token and "..." forms a string, in
+/// which a backslash makes the next character literal. An INT '.' INT
+/// sequence is then merged into one FLOAT token; the consumed neighbours
+/// stay in the array with type NONE.
+///
+/// On success returns a malloc'd array and stores its length in *ntokens;
+/// the caller frees each token with lapis_token_free() and then the array.
+/// Returns NULL (with *ntokens set to 0) on error, after reporting it on
+/// stderr, and also when the input holds no tokens at all.
+lapis_token_t *lapis_parse_tokens(size_t *ntokens, FILE *f) {
+
+    size_t num_tokens = 0;
+    lapis_token_t *tokens = NULL;
+    lapis_token_t tmptok = { .type = LAPIS_TTYPE_NONE, .str = NULL, .len = 0 };
+
+    *ntokens = 0;
+
+    int c;
+    while ( (c = fgetc(f)) != EOF ) {
+
+        // skip over comments
+        if ( c == '#' ) {
+            while ( (c = fgetc(f)) != '\n' && c != EOF );
+            if ( c == EOF ) break;
+        }
+
+        enum lapis_ttype cur_ttype;
+
+        if ( (tmptok.type == LAPIS_TTYPE_IDENT && isdigit(c)) || isalpha(c) || c == '_' ) {
+            cur_ttype = LAPIS_TTYPE_IDENT;
+        } else if ( isdigit(c) ) {
+            cur_ttype = LAPIS_TTYPE_INT;
+        } else if ( c == '"' ) {
+            cur_ttype = LAPIS_TTYPE_STR;
+        } else if ( ispunct(c) ) {
+            cur_ttype = LAPIS_TTYPE_OPER;
+        } else if ( isspace(c) ) {
+            cur_ttype = LAPIS_TTYPE_NONE;
+        } else {
+            fprintf(stderr, "ERROR [%s]: Invalid char c='%c' at offset=%ld!\n", __func__, c, ftell(f));
+            goto ERROR;
+        }
+
+        if ( cur_ttype != tmptok.type ) {
+            // store the previous token if it was valid
+            if ( tmptok.type != LAPIS_TTYPE_NONE ) {
+                if ( !lapis_tokens_push(&tokens, &num_tokens, &tmptok) ) {
+                    fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+                    goto ERROR;
+                }
+            }
+
+            // initialize new token
+            tmptok.type = cur_ttype;
+            switch (cur_ttype) {
+                default: break;
+                case LAPIS_TTYPE_IDENT:
+                case LAPIS_TTYPE_OPER:
+                case LAPIS_TTYPE_STR:
+                    tmptok.str = NULL;
+                    tmptok.len = 0;
+                    break;
+                // NOTE: float and bool will not occur during initial tokenization
+                // they are only for results of reducing expressions later
+                case LAPIS_TTYPE_INT:
+                    tmptok.i64 = 0;
+                    tmptok.ndigits = 0;
+                    break;
+            }
+        }
+
+        if ( cur_ttype == LAPIS_TTYPE_STR ) {
+            _Bool in_esc = false;
+            while ( (c = fgetc(f)) != '"' || in_esc ) {
+                if ( c == EOF ) {
+                    fprintf(stderr, "ERROR [%s]: EOF inside string token! offset=%ld\n", __func__, ftell(f));
+                    goto ERROR;
+                }
+                if ( !in_esc && c == '\\' ) {
+                    in_esc = true;
+                    continue;
+                }
+                in_esc = false;
+                if ( !lapis_token_append(&tmptok, (char)c) ) {
+                    fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+                    goto ERROR;
+                }
+            }
+            continue;
+        }
+
+        // extend existing token
+        switch (cur_ttype) {
+            default: break;
+            case LAPIS_TTYPE_IDENT:
+            case LAPIS_TTYPE_OPER:
+                if ( !lapis_token_append(&tmptok, (char)c) ) {
+                    fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+                    goto ERROR;
+                }
+                break;
+            case LAPIS_TTYPE_INT:
+                // refuse literals that do not fit instead of overflowing (UB)
+                if ( tmptok.i64 > (INT64_MAX - (c - '0')) / 10 ) {
+                    fprintf(stderr, "ERROR [%s]: Integer literal too large at offset=%ld!\n", __func__, ftell(f));
+                    goto ERROR;
+                }
+                tmptok.i64 = tmptok.i64 * 10 + (c - '0');
+                tmptok.ndigits++;
+                break;
+        }
+    }
+
+    // store the token still being built when the input ended; without
+    // this the last token was lost unless whitespace followed it
+    if ( tmptok.type != LAPIS_TTYPE_NONE ) {
+        if ( !lapis_tokens_push(&tokens, &num_tokens, &tmptok) ) {
+            fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+            goto ERROR;
+        }
+        tmptok.type = LAPIS_TTYPE_NONE;
+    }
+
+    // handle float decimal point
+    // FIXME known bug if '.' occurs after an operator w/o a space (e.g. +.234234 fails to parse as a float)
+    for (size_t n=0; n < num_tokens; n++) {
+
+        if ( tokens[n].type != LAPIS_TTYPE_OPER || tokens[n].len != 1 || tokens[n].str[0] != '.' ) continue;
+
+        _Bool valid = false;
+        double val = 0.;
+
+        if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_INT ) {
+            val += tokens[n-1].i64;
+            tokens[n-1].type = LAPIS_TTYPE_NONE;
+            valid = true;
+        }
+
+        if ( n+1 < num_tokens && tokens[n+1].type == LAPIS_TTYPE_INT ) {
+            // scale by the digit count, not the value, so "1.05" and "1.10" come out right
+            double den = 1;
+            for (size_t k=0; k < tokens[n+1].ndigits; k++) den *= 10;
+            val += tokens[n+1].i64 / den;
+            tokens[n+1].type = LAPIS_TTYPE_NONE;
+            valid = true;
+        }
+
+        if ( !valid ) continue;
+
+        // the '.' buffer is about to be overwritten by the float value
+        free( tokens[n].str );
+        tokens[n].type = LAPIS_TTYPE_FLOAT;
+        tokens[n].f64 = val;
+    }
+
+    // TODO handle sci notation (e.g. 3e[+-]6,3E[+-]6,...)
+
+    *ntokens = num_tokens;
+    return tokens;
+ERROR:
+    lapis_token_free(&tmptok);
+    for (size_t n=0; n < num_tokens; n++) lapis_token_free(&tokens[n]);
+    free(tokens);
+    return NULL;
+}
+
+/// Append a copy of `node` to the node array.
+/// Returns the new node's index, or SIZE_MAX (array left intact) when the allocation fails.
+static size_t lapis_nodes_push(lapis_node_t **nodes, size_t *nnodes, const lapis_node_t *node) {
+    lapis_node_t *grown = realloc(*nodes, (*nnodes + 1) * sizeof(lapis_node_t));
+    if ( grown == NULL ) return SIZE_MAX;
+    grown[*nnodes] = *node;
+    *nodes = grown;
+    return (*nnodes)++;
+}
+
+/// Claim the token at index `n` as an operand and return the index of its node.
+///
+/// A plain value becomes a new leaf node. An operator must already have been
+/// reduced: its node index is then decoded from oper_priorities[n] and `n` is
+/// stored in *taken_oper_idx so the caller can re-point that entry afterwards.
+/// Either way the token type is cleared so it cannot be claimed twice.
+/// Returns SIZE_MAX after reporting the problem on stderr when this fails.
+static size_t lapis_take_operand(lapis_token_t *tokens, const int64_t *oper_priorities, size_t n,
+                                 lapis_node_t **nodes, size_t *nnodes, size_t *taken_oper_idx) {
+    size_t node_idx;
+
+    if ( tokens[n].type == LAPIS_TTYPE_OPER ) {
+
+        // an operator only has a node once its priority was replaced by an encoded index (<= -2)
+        if ( oper_priorities[n] >= -1 ) {
+            fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
+            return SIZE_MAX;
+        }
+
+        // This is very weird, BUT as a vile hack I encode the node index
+        // into the negative values of oper_priorities to simultaneously:
+        //  a) clear the priority so the next one can be found
+        //  b) store the index of the new node associated with the operator so
+        //     later lower priority dependent operators can find its node.
+        node_idx = (size_t)( -(oper_priorities[n] + 2) );
+
+        // since we are taking ownership of this node we must flag it for update
+        *taken_oper_idx = n;
+
+    } else {
+
+        const lapis_node_t leaf = {
+            .value = tokens[n],
+            .lhs_idx = SIZE_MAX,
+            .rhs_idx = SIZE_MAX
+        };
+
+        node_idx = lapis_nodes_push(nodes, nnodes, &leaf);
+        if ( node_idx == SIZE_MAX ) {
+            fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+            return SIZE_MAX;
+        }
+    }
+
+    // clear token type to prevent reuse
+    tokens[n].type = LAPIS_TTYPE_NONE;
+    return node_idx;
+}
+
+/// Parse the expression read from `f` and print its tree to stdout.
+///
+/// Operators are reduced from highest to lowest priority (ties left to right);
+/// every level of parentheses adds 16 to the priority of what it encloses.
+/// An identifier directly followed by '(' is treated as a prefix operator.
+/// Errors are reported on stderr and end the parse; all memory is released
+/// on every path.
+void lapis_parse(FILE *f) {
+
+    size_t ntokens = 0;
+    size_t nnodes = 0;
+    lapis_node_t *nodes = NULL;
+    int64_t *oper_priorities = NULL;
+    uint8_t *oper_arg_form = NULL;
+    int64_t paren_level = 0;
+    // true while the next '+' or '-' would be a sign rather than a binary operator
+    _Bool expect_operand = true;
+
+    // read from the stream we were given (this used to be hard-wired to stdin)
+    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, f);
+    if ( tokens == NULL ) return;
+
+    // Remember every buffer the tokens own: token types are rewritten while
+    // parsing, so lapis_token_free() can no longer find them at the end.
+    char **owned = calloc( ntokens, sizeof(char *) );
+    if ( owned == NULL ) {
+        fprintf(stderr, "ERROR [%s:%d]: calloc failed!\n", __func__, __LINE__);
+        for (size_t n=0; n < ntokens; n++) lapis_token_free(&tokens[n]);
+        free(tokens);
+        return;
+    }
+    for (size_t n=0; n < ntokens; n++) {
+        switch (tokens[n].type) {
+            default: break;
+            case LAPIS_TTYPE_IDENT:
+            case LAPIS_TTYPE_STR:
+            case LAPIS_TTYPE_OPER:
+                owned[n] = tokens[n].str;
+                break;
+        }
+    }
+
+    oper_priorities = malloc( ntokens * sizeof(int64_t) );
+    // 0 - no args, 1 - lhs, 2 - rhs, 3 - both
+    oper_arg_form = calloc( ntokens, sizeof(uint8_t) );
+    if ( oper_priorities == NULL || oper_arg_form == NULL ) {
+        fprintf(stderr, "ERROR [%s:%d]: allocation failed!\n", __func__, __LINE__);
+        goto CLEANUP;
+    }
+
+    for (size_t n=0; n < ntokens; n++) {
+
+        if ( tokens[n].type != LAPIS_TTYPE_OPER ) {
+            oper_priorities[n] = -1;
+            // a value was just read, so a following '+' or '-' is binary
+            if ( tokens[n].type != LAPIS_TTYPE_NONE ) expect_operand = false;
+            continue;
+        }
+
+        // Operators are recognised by the product of their characters.
+        // Unsigned, so an overlong run of punctuation wraps around instead
+        // of overflowing a signed integer.
+        uint64_t prod = 1;
+        for (size_t k=0; k < tokens[n].len; k++) prod *= (unsigned char)tokens[n].str[k];
+
+        // every operator wants an operand next, except ')'
+        _Bool operand_follows = true;
+
+        switch (prod) {
+            default:
+                fputs("ERROR: unknown priority for operator '", stderr);
+                fwrite(tokens[n].str, 1, tokens[n].len, stderr);
+                fputs("'\n", stderr);
+                goto CLEANUP;
+            case 1ul*',':
+                oper_priorities[n] = 1;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'=':
+            case 1ul*'+'*'=':
+            case 1ul*'-'*'=':
+            case 1ul*'*'*'=':
+            case 1ul*'/'*'=':
+            case 1ul*'%'*'=':
+            case 1ul*'<'*'<'*'=':
+            case 1ul*'>'*'>'*'=':
+            case 1ul*'&'*'=':
+            case 1ul*'^'*'=':
+            case 1ul*'|'*'=':
+                oper_priorities[n] = 2;
+                oper_arg_form[n] = 3;
+                break;
+            //?: (3)
+            case 1ul*'|'*'|': oper_priorities[n] = 4; oper_arg_form[n] = 3; break;
+            case 1ul*'&'*'&': oper_priorities[n] = 5; oper_arg_form[n] = 3; break;
+            case 1ul*'|': oper_priorities[n] = 6; oper_arg_form[n] = 3; break;
+            case 1ul*'^': oper_priorities[n] = 7; oper_arg_form[n] = 3; break;
+            case 1ul*'&': oper_priorities[n] = 8; oper_arg_form[n] = 3; break;
+            case 1ul*'='*'=':
+            case 1ul*'!'*'=':
+                oper_priorities[n] = 9;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'<':
+            case 1ul*'>':
+            case 1ul*'<'*'=':
+            case 1ul*'>'*'=':
+                oper_priorities[n] = 10;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'<'*'<':
+            case 1ul*'>'*'>':
+                oper_priorities[n] = 11;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'+':
+            case 1ul*'-':
+                // check if actual binary operator or if this is a sign
+                if ( expect_operand ) {
+                    oper_arg_form[n] = 2; /*rhs only*/
+                    oper_priorities[n] = 14;
+                } else {
+                    oper_arg_form[n] = 3; /*binary*/
+                    oper_priorities[n] = 12;
+                }
+                break;
+            case 1ul*'*':
+            case 1ul*'/':
+            case 1ul*'%':
+                oper_priorities[n] = 13;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*':':
+                oper_priorities[n] = 14;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'!':
+            case 1ul*'~':
+                oper_priorities[n] = 14;
+                oper_arg_form[n] = 2;/*rhs only*/
+                break;
+            case 1ul*'+'*'+':
+            case 1ul*'-'*'-':
+                oper_priorities[n] = 15;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'.':
+            case 1ul*'-'*'>':
+                oper_priorities[n] = 15;
+                oper_arg_form[n] = 3;
+                break;
+            case 1ul*'(':
+                // handle function call / keyword operator
+                if ( n > 0 && tokens[n-1].type == LAPIS_TTYPE_IDENT ) {
+                    // treat the function identifier as a prefix operator
+                    tokens[n-1].type = LAPIS_TTYPE_OPER;
+                    oper_priorities[n-1] = 15 + paren_level * 16;
+                    oper_arg_form[n-1] = 2; // prefix-unary; takes only rhs
+                }
+                // clear the '(' as being an operator
+                oper_priorities[n] = -1;
+                tokens[n].type = LAPIS_TTYPE_NONE;
+                paren_level++;
+                break;
+            case 1ul*')':
+                if ( paren_level == 0 ) {
+                    fprintf(stderr, "ERROR(%d): unexpected ')'\n", __LINE__);
+                    goto CLEANUP;
+                }
+                // clear the ')' as being an operator
+                oper_priorities[n] = -1;
+                tokens[n].type = LAPIS_TTYPE_NONE;
+                paren_level--;
+                // a closed group is itself an operand
+                operand_follows = false;
+                break;
+        }
+
+        expect_operand = operand_follows;
+
+        if ( oper_priorities[n] != -1 ) oper_priorities[n] += paren_level * 16;
+    }
+
+    while (1) {
+
+        // seek the next highest priority operator
+
+        int64_t max_prior = -1;
+        size_t max_prior_idx = SIZE_MAX;
+
+        for (size_t n=0; n < ntokens; n++) {
+            if ( oper_priorities[n] > max_prior ) {
+                max_prior = oper_priorities[n];
+                max_prior_idx = n;
+            }
+        }
+
+        // quit if no more operators were found
+
+        if ( max_prior_idx == SIZE_MAX ) break;
+
+        lapis_node_t node = {
+            .value = tokens[max_prior_idx],
+            .lhs_idx = SIZE_MAX,
+            .rhs_idx = SIZE_MAX
+        };
+
+        size_t update_lhs_idx = SIZE_MAX, update_rhs_idx = SIZE_MAX;
+
+        if ( oper_arg_form[max_prior_idx] & 1 ) { /*get lhs if needed*/
+
+            // nearest token to the left that has not been consumed yet
+            size_t n = max_prior_idx;
+            while ( n > 0 ) {
+                n--;
+                if ( tokens[n].type != LAPIS_TTYPE_NONE ) break;
+            }
+
+            if ( n == max_prior_idx || tokens[n].type == LAPIS_TTYPE_NONE ) {
+                fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
+                goto CLEANUP;
+            }
+
+            node.lhs_idx = lapis_take_operand(tokens, oper_priorities, n, &nodes, &nnodes, &update_lhs_idx);
+            if ( node.lhs_idx == SIZE_MAX ) goto CLEANUP;
+        }
+
+        if ( oper_arg_form[max_prior_idx] & 2 ) { /*get rhs if needed*/
+
+            // nearest token to the right that has not been consumed yet
+            size_t n = max_prior_idx + 1;
+            while ( n < ntokens && tokens[n].type == LAPIS_TTYPE_NONE ) n++;
+
+            // running off the end used to read tokens[ntokens]
+            if ( n >= ntokens ) {
+                fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
+                goto CLEANUP;
+            }
+
+            node.rhs_idx = lapis_take_operand(tokens, oper_priorities, n, &nodes, &nnodes, &update_rhs_idx);
+            if ( node.rhs_idx == SIZE_MAX ) goto CLEANUP;
+        }
+
+        const size_t idx = lapis_nodes_push(&nodes, &nnodes, &node);
+        if ( idx == SIZE_MAX ) {
+            fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+            goto CLEANUP;
+        }
+
+        printf("add oper token %zu w/ priority %lld as node %zu\n",
+            max_prior_idx, (long long)oper_priorities[max_prior_idx], idx);
+
+        // vile hack again, but on the setting side: an operator that has been
+        // reduced stores -(node index) - 2 where its priority used to be, which
+        // both removes it from the search above and lets a later, lower
+        // priority operator find the node to use as its operand
+        const int64_t encoded = -(int64_t)idx - 2;
+        oper_priorities[max_prior_idx] = encoded;
+
+        // We also have to update any previous operator nodes to point to our new
+        // node as well. This is because those nodes will end up double owned otherwise.
+        if ( update_lhs_idx != SIZE_MAX ) oper_priorities[update_lhs_idx] = encoded;
+        if ( update_rhs_idx != SIZE_MAX ) oper_priorities[update_rhs_idx] = encoded;
+    }
+
+    // an input without any operator still has one value worth showing
+    if ( nnodes == 0 ) {
+        for (size_t n=0; n < ntokens; n++) {
+            if ( tokens[n].type == LAPIS_TTYPE_NONE ) continue;
+            const lapis_node_t leaf = {
+                .value = tokens[n],
+                .lhs_idx = SIZE_MAX,
+                .rhs_idx = SIZE_MAX
+            };
+            if ( lapis_nodes_push(&nodes, &nnodes, &leaf) == SIZE_MAX ) {
+                fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
+                goto CLEANUP;
+            }
+            break;
+        }
+    }
+
+    //TODO ';' or w/e to **separate expressions**
+
+    // the operator reduced last has the lowest priority and is the root
+    if ( nnodes > 0 ) {
+        lapis_node_print(nodes, nnodes-1);
+        putchar('\n');
+    }
+
+CLEANUP:
+    for (size_t n=0; n < ntokens; n++) free(owned[n]);
+    free(owned);
+    free(nodes);
+    free(oper_arg_form);
+    free(oper_priorities);
+    free(tokens);
+}
+
+/// Entry point: parse the expression on stdin and print its tree.
+int main(void) {
+#if 0
+    size_t ntokens=0;
+    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, stdin);
+
+    printf("Tokens (%zu):\n", ntokens);
+    for (size_t n=0; n < ntokens; n++) {
+        switch (tokens[n].type) {
+            default: break;
+            case LAPIS_TTYPE_STR:
+            case LAPIS_TTYPE_IDENT:
+            case LAPIS_TTYPE_OPER:
+                fputs(" ", stdout);
+                fwrite(tokens[n].str, 1, tokens[n].len, stdout);
+                putchar('\n');
+                break;
+            case LAPIS_TTYPE_INT:
+                printf(" %lld (int)\n", (long long)tokens[n].i64);
+                break;
+            case LAPIS_TTYPE_FLOAT:
+                printf(" %lf (float)\n", tokens[n].f64);
+                break;
+        }
+        lapis_token_free(&tokens[n]);
+    }
+    free(tokens);
+#else
+    lapis_parse(stdin);
+#endif
+
+    return 0;
+}
diff --git a/possible_syntax.lapis b/possible_syntax.lapis
new file mode 100644
index 0000000..7632a63
--- /dev/null
+++ b/possible_syntax.lapis
@@ -0,0 +1,44 @@
+include("std.print")
+
+# external print implementation ex ##
+print ( fmt : (compexpr | string) , values : any ... ) -> void {
+
+    last_n : int = 0
+
+    for (values) -> (value : any) {
+
+        n : int = str_finds( fmt , "%" , last_n )
+
+        static_assert( n != INT_MAX, "Number of format arguments does not match passed values!" )
+
+        putstr( stdout, fmt[last_n : n] )
+
+        switch (value) {
+            default: break
+            case (int) -> putstr( stdout, to_str_from_int( int(value) ) )
+            ...
+        }
+
+        last_n = n
+    }
+}
+######################################
+
+
+
+test_func ( x : int , y : int ) -> int {
+    return ( x * y );
+};
+
+print( "test_func(%, %) = %" , 2, 3, test_func(2,3) );
+
+x : int = 12345;
+
+if ( x > 0 ) {
+    print( "as expected; x = %\n" , x )
+}
+
+main ( argc : int , argv : string[_] ) -> int {
+
+    return (0);
+}