#include <ctype.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/// A lexical token. Which union member is live depends on `type`:
///  - IDENT / STR / OPER : `str` (heap buffer, NOT NUL-terminated) and `len`
///  - INT                : `i64`, plus `ndigits` = number of digits that were
///                         read (needed to rebuild fractions such as ".05")
///  - FLOAT              : `f64`
///  - BOOL               : `bool_`
///  - NONE               : nothing (also used to mark a consumed token)
typedef struct {
    enum lapis_ttype {
        LAPIS_TTYPE_NONE,
        LAPIS_TTYPE_IDENT,
        LAPIS_TTYPE_STR,
        LAPIS_TTYPE_FLOAT,
        LAPIS_TTYPE_INT,
        LAPIS_TTYPE_BOOL,
        LAPIS_TTYPE_OPER,
    } type;
    union {
        struct {
            size_t len;
            char *str;
        };
        double f64;
        struct {
            int64_t i64;
            size_t ndigits;
        };
        _Bool bool_;
    };
} lapis_token_t;

/// A node of the expression tree. Children are indices into the node array;
/// SIZE_MAX means "no child". `value` is a shallow copy of a token, so it
/// shares (and does not own) that token's string buffer.
typedef struct lapis_node_s {
    lapis_token_t value;
    size_t lhs_idx, rhs_idx;
} lapis_node_t;

/// Operand layout of an operator: bit 0 = takes a lhs, bit 1 = takes a rhs.
enum {
    LAPIS_ARGS_NONE = 0,
    LAPIS_ARGS_LHS = 1,
    LAPIS_ARGS_RHS = 2,
    LAPIS_ARGS_BOTH = 3,
};

enum {
    LAPIS_PAREN_BOOST = 16,     ///< priority added per open parenthesis level
    LAPIS_PREFIX_PRIORITY = 14, ///< priority of unary sign operators
    LAPIS_CALL_PRIORITY = 15,   ///< priority of `ident (` used as a prefix operator
};

/// One entry of the operator table.
typedef struct {
    const char *text;
    int64_t priority;
    uint8_t arg_form;
} lapis_oper_info_t;

/// Known operators, lowest priority first. '+' and '-' are listed in their
/// binary form; the unary (sign) form is detected from context in lapis_parse.
/// '(' and ')' are not listed because they are handled structurally.
static const lapis_oper_info_t lapis_operators[] = {
    { ",", 1, LAPIS_ARGS_BOTH },
    { "=", 2, LAPIS_ARGS_BOTH },   { "+=", 2, LAPIS_ARGS_BOTH },
    { "-=", 2, LAPIS_ARGS_BOTH },  { "*=", 2, LAPIS_ARGS_BOTH },
    { "/=", 2, LAPIS_ARGS_BOTH },  { "%=", 2, LAPIS_ARGS_BOTH },
    { "<<=", 2, LAPIS_ARGS_BOTH }, { ">>=", 2, LAPIS_ARGS_BOTH },
    { "&=", 2, LAPIS_ARGS_BOTH },  { "^=", 2, LAPIS_ARGS_BOTH },
    { "|=", 2, LAPIS_ARGS_BOTH },
    // priority 3 is reserved for ?:
    { "||", 4, LAPIS_ARGS_BOTH },
    { "&&", 5, LAPIS_ARGS_BOTH },
    { "|", 6, LAPIS_ARGS_BOTH },
    { "^", 7, LAPIS_ARGS_BOTH },
    { "&", 8, LAPIS_ARGS_BOTH },
    { "==", 9, LAPIS_ARGS_BOTH },  { "!=", 9, LAPIS_ARGS_BOTH },
    { "<", 10, LAPIS_ARGS_BOTH },  { ">", 10, LAPIS_ARGS_BOTH },
    { "<=", 10, LAPIS_ARGS_BOTH }, { ">=", 10, LAPIS_ARGS_BOTH },
    { "<<", 11, LAPIS_ARGS_BOTH }, { ">>", 11, LAPIS_ARGS_BOTH },
    { "+", 12, LAPIS_ARGS_BOTH },  { "-", 12, LAPIS_ARGS_BOTH },
    { "*", 13, LAPIS_ARGS_BOTH },  { "/", 13, LAPIS_ARGS_BOTH },
    { "%", 13, LAPIS_ARGS_BOTH },
    { ":", 14, LAPIS_ARGS_BOTH },
    { "!", 14, LAPIS_ARGS_RHS },   { "~", 14, LAPIS_ARGS_RHS },
    { "++", 15, LAPIS_ARGS_BOTH }, { "--", 15, LAPIS_ARGS_BOTH },
    { ".", 15, LAPIS_ARGS_BOTH },  { "->", 15, LAPIS_ARGS_BOTH },
};

/// Look up an operator by its exact spelling; returns NULL if it is unknown.
static const lapis_oper_info_t *lapis_find_operator(const char *str, size_t len)
{
    const size_t count = sizeof lapis_operators / sizeof lapis_operators[0];
    for (size_t i = 0; i < count; i++) {
        const char *text = lapis_operators[i].text;
        if (strlen(text) == len && memcmp(text, str, len) == 0) {
            return &lapis_operators[i];
        }
    }
    return NULL;
}

/// Append one byte to a string-carrying token. Returns false on allocation
/// failure, in which case the token is left unchanged.
static bool lapis_token_append_char(lapis_token_t *tok, int c)
{
    char *grown = realloc(tok->str, tok->len + 1);
    if (grown == NULL) {
        return false;
    }
    grown[tok->len] = (char)c;
    tok->str = grown;
    tok->len++;
    return true;
}

/// Append a (shallow) copy of *tok to a growing token array. Returns false on
/// allocation failure, in which case the array is left unchanged.
static bool lapis_token_list_append(lapis_token_t **tokens, size_t *count,
                                    const lapis_token_t *tok)
{
    lapis_token_t *grown = realloc(*tokens, (*count + 1) * sizeof **tokens);
    if (grown == NULL) {
        return false;
    }
    grown[*count] = *tok;
    *tokens = grown;
    (*count)++;
    return true;
}

/// Append a copy of *node to a growing node array. Returns false (after
/// reporting on stderr) on allocation failure; the array is left unchanged.
static bool lapis_node_list_append(lapis_node_t **nodes, size_t *count,
                                   const lapis_node_t *node)
{
    lapis_node_t *grown = realloc(*nodes, (*count + 1) * sizeof **nodes);
    if (grown == NULL) {
        fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
        return false;
    }
    grown[*count] = *node;
    *nodes = grown;
    (*count)++;
    return true;
}

/// Print the subtree rooted at nodes[idx] to stdout in infix order.
/// Operator nodes are wrapped in parentheses; children are separated from the
/// node's own text by a single space. Does nothing for an empty tree.
void lapis_node_print(const lapis_node_t *nodes, size_t idx)
{
    if (nodes == NULL || idx == SIZE_MAX) {
        return;
    }

    const lapis_node_t *node = &nodes[idx];
    const bool is_oper = node->value.type == LAPIS_TTYPE_OPER;

    if (is_oper) {
        putchar('(');
    }
    if (node->lhs_idx != SIZE_MAX) {
        lapis_node_print(nodes, node->lhs_idx);
        putchar(' ');
    }

    switch (node->value.type) {
    case LAPIS_TTYPE_IDENT:
    case LAPIS_TTYPE_STR:
    case LAPIS_TTYPE_OPER:
        // an empty string token has a NULL buffer; don't hand that to fwrite
        if (node->value.len > 0) {
            fwrite(node->value.str, 1, node->value.len, stdout);
        }
        break;
    case LAPIS_TTYPE_FLOAT:
        printf("%f", node->value.f64);
        break;
    case LAPIS_TTYPE_INT:
        printf("%" PRId64, node->value.i64);
        break;
    case LAPIS_TTYPE_BOOL:
        fputs(node->value.bool_ ? "true" : "false", stdout);
        break;
    default:
        break;
    }

    if (node->rhs_idx != SIZE_MAX) {
        putchar(' ');
        lapis_node_print(nodes, node->rhs_idx);
    }
    if (is_oper) {
        putchar(')');
    }
}

/// Free a token's internal buffer if it has one
/// (identifier, operator and string tokens own a heap buffer).
void lapis_token_free(lapis_token_t *token)
{
    if (token == NULL) {
        return;
    }
    switch (token->type) {
    case LAPIS_TTYPE_IDENT:
    case LAPIS_TTYPE_OPER:
    case LAPIS_TTYPE_STR:
        free(token->str);
        token->str = NULL;
        token->len = 0;
        break;
    default:
        break;
    }
}

/// Tokenize one expression from `f`.
///
/// Reading stops at ';', at an unescaped newline once at least one token has
/// been seen, or at EOF. '#' starts a comment that runs to end of line and a
/// backslash escapes the following newline. Runs of punctuation form a single
/// operator token, except that '(' and ')' are always tokens of their own.
/// Adjacent string literals are concatenated. INT '.' INT sequences are merged
/// into a FLOAT token; the consumed INT tokens stay in the array with type
/// LAPIS_TTYPE_NONE.
///
/// @param ntokens  receives the number of entries in the returned array
///                 (0 on failure)
/// @param f        stream to read from
/// @return a malloc'd array the caller must release (lapis_token_free on each
///         entry, then free), or NULL on error or when no token was read.
lapis_token_t *lapis_parse_tokens(size_t *ntokens, FILE *f)
{
    size_t num_tokens = 0;
    lapis_token_t *tokens = NULL;
    lapis_token_t tmptok = { .type = LAPIS_TTYPE_NONE };
    bool end_of_expression = false;
    int c;

    // test the flag first so no character past the terminator is consumed
    while (!end_of_expression && (c = fgetc(f)) != EOF) {
        bool in_escape = false;

        // skip over comments
        if (c == '#') {
            while ((c = fgetc(f)) != '\n' && c != EOF) {
            }
            if (c == EOF) {
                break;
            }
        }

        // detect line escapes
        if (c == '\\') {
            in_escape = true;
            c = fgetc(f);
            if (c == EOF) {
                break;
            }
        }

        if (c == ';' || (!in_escape && c == '\n')) {
            c = ' ';
            // blank lines are skipped; a token still being built counts too
            if (num_tokens > 0 || tmptok.type != LAPIS_TTYPE_NONE) {
                end_of_expression = true;
            }
        }

        enum lapis_ttype cur_ttype;
        if ((tmptok.type == LAPIS_TTYPE_IDENT && isdigit(c)) || isalpha(c) || c == '_') {
            cur_ttype = LAPIS_TTYPE_IDENT;
        } else if (isdigit(c)) {
            cur_ttype = LAPIS_TTYPE_INT;
        } else if (c == '"') {
            cur_ttype = LAPIS_TTYPE_STR;
        } else if (ispunct(c)) {
            cur_ttype = LAPIS_TTYPE_OPER;
        } else if (isspace(c)) {
            cur_ttype = LAPIS_TTYPE_NONE;
        } else {
            fprintf(stderr, "ERROR [%s]: Invalid char c='%c' at offset=%ld!\n",
                    __func__, c, ftell(f));
            goto fail;
        }

        // A change of token class starts a new token. Parentheses never merge
        // with neighbouring punctuation, so "f(x)+1" and "((a))" tokenize.
        bool start_new = cur_ttype != tmptok.type;
        if (!start_new && cur_ttype == LAPIS_TTYPE_OPER && tmptok.len > 0) {
            const char last = tmptok.str[tmptok.len - 1];
            if (c == '(' || c == ')' || last == '(' || last == ')') {
                start_new = true;
            }
        }

        if (start_new) {
            // store the previous token if it was valid
            if (tmptok.type != LAPIS_TTYPE_NONE) {
                if (!lapis_token_list_append(&tokens, &num_tokens, &tmptok)) {
                    fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
                    goto fail;
                }
            }
            // initialize new token (the stored copy now owns the old buffer)
            // NOTE: float and bool will not occur during initial tokenization
            // they are only for results of reducing expressions later
            tmptok = (lapis_token_t){ .type = cur_ttype };
            if (cur_ttype == LAPIS_TTYPE_INT) {
                tmptok.i64 = 0;
                tmptok.ndigits = 0;
            }
        }

        if (cur_ttype == LAPIS_TTYPE_STR) {
            // read up to the closing quote; a backslash makes the next
            // character literal (no escape translation is performed)
            bool in_esc = false;
            while ((c = fgetc(f)) != '"' || in_esc) {
                if (c == EOF) {
                    fprintf(stderr, "ERROR [%s]: EOF inside string token! offset=%ld\n",
                            __func__, ftell(f));
                    goto fail;
                }
                if (!in_esc && c == '\\') {
                    in_esc = true;
                    continue;
                }
                in_esc = false;
                if (!lapis_token_append_char(&tmptok, c)) {
                    fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
                    goto fail;
                }
            }
            continue;
        }

        // extend existing token
        switch (cur_ttype) {
        case LAPIS_TTYPE_IDENT:
        case LAPIS_TTYPE_OPER:
            if (!lapis_token_append_char(&tmptok, c)) {
                fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
                goto fail;
            }
            break;
        case LAPIS_TTYPE_INT: {
            const int digit = c - '0';
            // signed overflow is undefined behaviour, so reject it up front
            if (tmptok.i64 > (INT64_MAX - digit) / 10) {
                fprintf(stderr, "ERROR [%s]: integer literal too large at offset=%ld!\n",
                        __func__, ftell(f));
                goto fail;
            }
            tmptok.i64 = tmptok.i64 * 10 + digit;
            tmptok.ndigits++;
            break;
        }
        default:
            break;
        }
    }

    // input ended without a terminator: keep the token still being built
    if (tmptok.type != LAPIS_TTYPE_NONE) {
        if (!lapis_token_list_append(&tokens, &num_tokens, &tmptok)) {
            fprintf(stderr, "ERROR [%s:%d]: realloc failed!\n", __func__, __LINE__);
            goto fail;
        }
        tmptok = (lapis_token_t){ .type = LAPIS_TTYPE_NONE };
    }

    // handle float decimal point
    // FIXME known bug if '.' occurs after an operator w/o a space
    // (e.g. +.234234 fails to parse as a float)
    for (size_t n = 0; n < num_tokens; n++) {
        lapis_token_t *dot = &tokens[n];
        if (dot->type != LAPIS_TTYPE_OPER || dot->len != 1 || dot->str[0] != '.') {
            continue;
        }

        bool valid = false;
        double val = 0.;
        if (n > 0 && tokens[n - 1].type == LAPIS_TTYPE_INT) {
            val += (double)tokens[n - 1].i64;
            tokens[n - 1].type = LAPIS_TTYPE_NONE;
            valid = true;
        }
        if (n + 1 < num_tokens && tokens[n + 1].type == LAPIS_TTYPE_INT) {
            // scale by the digit count, not the value, so ".05" and ".10" work
            double den = 1.;
            for (size_t d = 0; d < tokens[n + 1].ndigits; d++) {
                den *= 10.;
            }
            val += (double)tokens[n + 1].i64 / den;
            tokens[n + 1].type = LAPIS_TTYPE_NONE;
            valid = true;
        }
        if (!valid) {
            continue;
        }

        free(dot->str); // the "." buffer is no longer reachable once retyped
        dot->type = LAPIS_TTYPE_FLOAT;
        dot->f64 = val;
    }
    // TODO handle sci notation (e.g. 3e[+-]6,3E[+-]6,...)

    *ntokens = num_tokens;
    return tokens;

fail:
    lapis_token_free(&tmptok);
    for (size_t n = 0; n < num_tokens; n++) {
        lapis_token_free(&tokens[n]);
    }
    free(tokens);
    *ntokens = 0;
    return NULL;
}

/// Turn the operand token at index `n` into a node reference.
///
/// A plain token becomes a new leaf node. An operator token must already have
/// been reduced to a node, whose index is recovered from the negative value
/// stored in oper_priorities[n]; in that case *update_idx is set to `n` so the
/// caller can re-point that slot at the new parent node. The token is then
/// marked LAPIS_TTYPE_NONE to prevent reuse.
///
/// @return false (after reporting on stderr) on failure.
static bool lapis_take_operand(lapis_token_t *tokens, const int64_t *oper_priorities,
                               size_t n, lapis_node_t **nodes, size_t *nnodes,
                               size_t *operand_node, size_t *update_idx)
{
    if (tokens[n].type == LAPIS_TTYPE_OPER) {
        // a non-negative priority means this operator has not been reduced yet
        if (oper_priorities[n] >= -1) {
            fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
            return false;
        }
        // reduced operators store -(node index) - 2, so this is always >= 0
        *operand_node = (size_t)(-(oper_priorities[n] + 2));
        // since we are taking ownership of this node we must flag it for update
        *update_idx = n;
    } else {
        const lapis_node_t leaf = {
            .value = tokens[n], .lhs_idx = SIZE_MAX, .rhs_idx = SIZE_MAX
        };
        if (!lapis_node_list_append(nodes, nnodes, &leaf)) {
            return false;
        }
        *operand_node = *nnodes - 1;
    }
    // clear token type to prevent reuse
    tokens[n].type = LAPIS_TTYPE_NONE;
    return true;
}

/// Read one expression from `f`, build its expression tree and print it to
/// stdout, fully parenthesized and followed by a newline.
///
/// Operators are reduced in order of decreasing priority (leftmost first on a
/// tie); each reduction also prints an "add oper token ..." trace line.
/// Every open parenthesis raises the priority of the operators inside it.
/// An identifier directly followed by '(' is treated as a prefix operator.
/// Errors are reported on stderr and nothing is printed for the expression.
///
/// Known limitations: prefix operators of equal priority cannot be chained
/// (e.g. "! - a"), and an unclosed '(' is not diagnosed.
void lapis_parse(FILE *f)
{
    size_t ntokens = 0;
    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, f);
    if (tokens == NULL) {
        return;
    }

    int64_t *oper_priorities = malloc(ntokens * sizeof *oper_priorities);
    // 0 - no args, 1 - lhs, 2 - rhs, 3 - both
    uint8_t *oper_arg_form = calloc(ntokens, sizeof *oper_arg_form);
    // Token types are rewritten while parsing and nodes hold shallow copies,
    // so every string buffer is recorded here once and released from here.
    char **token_bufs = calloc(ntokens, sizeof *token_bufs);
    lapis_node_t *nodes = NULL;
    size_t nnodes = 0;

    if (oper_priorities == NULL || oper_arg_form == NULL || token_bufs == NULL) {
        fprintf(stderr, "ERROR [%s:%d]: allocation failed!\n", __func__, __LINE__);
        for (size_t n = 0; n < ntokens; n++) {
            lapis_token_free(&tokens[n]);
        }
        free(tokens);
        free(oper_priorities);
        free(oper_arg_form);
        free(token_bufs);
        return;
    }

    for (size_t n = 0; n < ntokens; n++) {
        switch (tokens[n].type) {
        case LAPIS_TTYPE_IDENT:
        case LAPIS_TTYPE_STR:
        case LAPIS_TTYPE_OPER:
            token_bufs[n] = tokens[n].str;
            break;
        default:
            break;
        }
    }

    // Pass 1: give every operator a priority and an operand layout.
    int64_t paren_level = 0;
    bool prev_is_operand = false; // does an operand end right before token n?
    for (size_t n = 0; n < ntokens; n++) {
        lapis_token_t *tok = &tokens[n];

        if (tok->type != LAPIS_TTYPE_OPER) {
            oper_priorities[n] = -1;
            if (tok->type != LAPIS_TTYPE_NONE) {
                prev_is_operand = true;
            }
            continue;
        }

        if (tok->len == 1 && tok->str[0] == '(') {
            // handle function call / keyword operator
            if (n > 0 && tokens[n - 1].type == LAPIS_TTYPE_IDENT) {
                // treat the function identifier as a prefix operator
                tokens[n - 1].type = LAPIS_TTYPE_OPER;
                oper_priorities[n - 1] =
                    LAPIS_CALL_PRIORITY + paren_level * LAPIS_PAREN_BOOST;
                oper_arg_form[n - 1] = LAPIS_ARGS_RHS;
            }
            // clear the '(' as being an operator
            oper_priorities[n] = -1;
            tok->type = LAPIS_TTYPE_NONE;
            paren_level++;
            prev_is_operand = false;
            continue;
        }

        if (tok->len == 1 && tok->str[0] == ')') {
            if (paren_level == 0) {
                fprintf(stderr, "ERROR(%d): unexpected ')'\n", __LINE__);
                goto cleanup;
            }
            // clear the ')' as being an operator
            oper_priorities[n] = -1;
            tok->type = LAPIS_TTYPE_NONE;
            paren_level--;
            prev_is_operand = true;
            continue;
        }

        const lapis_oper_info_t *info = lapis_find_operator(tok->str, tok->len);
        if (info == NULL) {
            fputs("ERROR: unknown priority for operator '", stderr);
            fwrite(tok->str, 1, tok->len, stderr);
            fputs("'\n", stderr);
            goto cleanup;
        }

        int64_t priority = info->priority;
        uint8_t arg_form = info->arg_form;
        // check if actual binary operator or if this is a sign: a '+'/'-'
        // with no operand in front of it (start of input, after another
        // operator, after '(') is a prefix operator
        if (tok->len == 1 && (tok->str[0] == '+' || tok->str[0] == '-') && !prev_is_operand) {
            priority = LAPIS_PREFIX_PRIORITY;
            arg_form = LAPIS_ARGS_RHS;
        }

        oper_priorities[n] = priority + paren_level * LAPIS_PAREN_BOOST;
        oper_arg_form[n] = arg_form;
        prev_is_operand = false;
    }

    // Pass 2: repeatedly reduce the highest-priority operator into a node.
    for (;;) {
        // seek the next highest priority operator
        int64_t max_prior = -1;
        size_t max_prior_idx = SIZE_MAX;
        for (size_t n = 0; n < ntokens; n++) {
            if (oper_priorities[n] < 0) {
                continue;
            }
            if (oper_priorities[n] > max_prior) {
                max_prior = oper_priorities[n];
                max_prior_idx = n;
            }
        }
        // quit if no more operators were found
        if (max_prior_idx == SIZE_MAX) {
            break;
        }

        lapis_node_t node = {
            .value = tokens[max_prior_idx], .lhs_idx = SIZE_MAX, .rhs_idx = SIZE_MAX
        };
        size_t update_lhs_idx = SIZE_MAX, update_rhs_idx = SIZE_MAX;

        if (oper_arg_form[max_prior_idx] & LAPIS_ARGS_LHS) {
            // nearest unconsumed token to the left
            size_t n = max_prior_idx;
            bool found = false;
            while (n > 0) {
                n--;
                if (tokens[n].type != LAPIS_TTYPE_NONE) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
                goto cleanup;
            }
            if (!lapis_take_operand(tokens, oper_priorities, n, &nodes, &nnodes,
                                    &node.lhs_idx, &update_lhs_idx)) {
                goto cleanup;
            }
        }

        if (oper_arg_form[max_prior_idx] & LAPIS_ARGS_RHS) {
            // nearest unconsumed token to the right
            size_t n = max_prior_idx + 1;
            while (n < ntokens && tokens[n].type == LAPIS_TTYPE_NONE) {
                n++;
            }
            if (n >= ntokens) {
                fprintf(stderr, "ERROR: bad logic line #%d\n", __LINE__);
                goto cleanup;
            }
            if (!lapis_take_operand(tokens, oper_priorities, n, &nodes, &nnodes,
                                    &node.rhs_idx, &update_rhs_idx)) {
                goto cleanup;
            }
        }

        if (!lapis_node_list_append(&nodes, &nnodes, &node)) {
            goto cleanup;
        }
        const int64_t idx = (int64_t)(nnodes - 1);
        printf("add oper token %zu w/ priority %" PRId64 " as node %" PRId64 "\n",
               max_prior_idx, oper_priorities[max_prior_idx], idx);

        // This is very weird, BUT as a vile hack the node index is encoded
        // into the negative values of oper_priorities to simultaneously:
        //  a) clear the priority so the next one can be found
        //  b) store the index of the new node associated with the operator so
        //     later lower priority dependent operators can find its node.
        oper_priorities[max_prior_idx] = -idx - 2;
        // We also have to update any previous operator nodes to point to our new
        // node as well. This is because those nodes will end up double owned otherwise.
        if (update_lhs_idx != SIZE_MAX) {
            oper_priorities[update_lhs_idx] = -idx - 2;
        }
        if (update_rhs_idx != SIZE_MAX) {
            oper_priorities[update_rhs_idx] = -idx - 2;
        }
    }

    // An expression without any operator is just its first operand.
    if (nnodes == 0) {
        for (size_t n = 0; n < ntokens; n++) {
            if (tokens[n].type == LAPIS_TTYPE_NONE) {
                continue;
            }
            const lapis_node_t leaf = {
                .value = tokens[n], .lhs_idx = SIZE_MAX, .rhs_idx = SIZE_MAX
            };
            if (!lapis_node_list_append(&nodes, &nnodes, &leaf)) {
                goto cleanup;
            }
            break;
        }
    }

    // the node created last belongs to the lowest priority operator: the root
    if (nnodes > 0) {
        lapis_node_print(nodes, nnodes - 1);
        putchar('\n');
    }
    // TODO ';' or w/e to **separate expressions**

cleanup:
    for (size_t n = 0; n < ntokens; n++) {
        free(token_bufs[n]);
    }
    free(token_bufs);
    free(tokens);
    free(nodes);
    free(oper_priorities);
    free(oper_arg_form);
}

/// Parse one expression from stdin and print its tree.
/// Build with -DLAPIS_DUMP_TOKENS to list the raw tokens instead.
int main(void)
{
#ifdef LAPIS_DUMP_TOKENS
    size_t ntokens = 0;
    lapis_token_t *tokens = lapis_parse_tokens(&ntokens, stdin);
    printf("Tokens (%zu):\n", ntokens);
    for (size_t n = 0; n < ntokens; n++) {
        switch (tokens[n].type) {
        case LAPIS_TTYPE_STR:
        case LAPIS_TTYPE_IDENT:
        case LAPIS_TTYPE_OPER:
            fputs(" ", stdout);
            if (tokens[n].len > 0) {
                fwrite(tokens[n].str, 1, tokens[n].len, stdout);
            }
            putchar('\n');
            break;
        case LAPIS_TTYPE_INT:
            printf(" %" PRId64 " (int)\n", tokens[n].i64);
            break;
        case LAPIS_TTYPE_FLOAT:
            printf(" %f (float)\n", tokens[n].f64);
            break;
        default:
            break;
        }
    }
    for (size_t n = 0; n < ntokens; n++) {
        lapis_token_free(&tokens[n]);
    }
    free(tokens);
#else
    lapis_parse(stdin);
#endif
    return 0;
}