/*
 * c-hdr-parser.c -- WIP system C header parser.
 *
 * Runs a header through the C preprocessor (`cc -E`) and prints one flattened
 * declaration per line.
 *
 * Recovered from patch be69ebfaf5dc61d0299fc0ca3663c0e1d5e3bf79
 * ("wip system c header parser", icst, Thu 27 Jun 2024 22:36:30 -0400).
 */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L /* popen(), pclose(), fmemopen() */
#endif

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char **parse_c_header(size_t *ndecl, FILE *f);

/* ------------------------------------------------------------------------ */
/* small private helpers                                                    */
/* ------------------------------------------------------------------------ */

/// realloc() that aborts the program on allocation failure (never returns NULL)
static void *xrealloc(void *ptr, size_t size) {
    void *tmp = realloc(ptr, size != 0 ? size : 1);
    if (tmp == NULL) {
        fputs("c-hdr-parser: out of memory\n", stderr);
        abort();
    }
    return tmp;
}

/// Append one character to the growable (non NUL-terminated) buffer buf/len
static char *buf_push(char *buf, size_t *len, int ch) {
    buf = xrealloc(buf, *len + 1);
    buf[(*len)++] = (char)ch;
    return buf;
}

/// Append n bytes of src to the growable (non NUL-terminated) buffer buf/len
static char *buf_append(char *buf, size_t *len, const char *src, size_t n) {
    if (n == 0) return buf;
    buf = xrealloc(buf, *len + n);
    memcpy(buf + *len, src, n);
    *len += n;
    return buf;
}

/// True for characters that may appear inside an identifier: [A-Za-z0-9_]
static int is_ident_char(int ch) {
    return ch == '_' || isalnum((unsigned char)ch);
}

/// True for characters that may start an identifier: [A-Za-z_]
static int is_ident_start(int ch) {
    return ch == '_' || isalpha((unsigned char)ch);
}

/* ------------------------------------------------------------------------ */

/**
 * realloc s1 and append s2 to it (modifies only s1).
 *
 * A NULL s1 is treated as an empty string; a NULL s2 appends nothing.
 *
 * @return the (possibly moved) string, or NULL on allocation failure, in
 *         which case s1 has not been freed and is still owned by the caller.
 */
char *cstr_append(char *s1, const char *s2) {
    size_t len1 = (s1 != NULL) ? strlen(s1) : 0;
    size_t len2 = (s2 != NULL) ? strlen(s2) : 0;

    char *tmp = realloc(s1, len1 + len2 + 1);
    if (tmp == NULL) return NULL;

    if (len2 > 0) memcpy(tmp + len1, s2, len2);
    tmp[len1 + len2] = '\0';
    return tmp;
}

/**
 * Open a stream of the desired header (after 1st stage processing of the given CC program)
 * which includes all other headers it also includes and in addition the compiler evaluates
 * all macros and cleans out all comments for us.
 *
 * The command executed is:  echo "#include <hdr_fn>" | cc_cmd -E -
 *
 * hdr_fn is spliced into a shell command line, so only plain header paths
 * made of [A-Za-z0-9_./+-] are accepted; anything else fails with EINVAL.
 *
 * @return a stream to be closed with pclose(), or NULL on failure.
 */
FILE *open_c_header(const char *cc_cmd, const char *hdr_fn) {
    static const char fmt[] = "echo \"#include <%s>\" | %s -E -";

    if (cc_cmd == NULL || hdr_fn == NULL || *hdr_fn == '\0') {
        errno = EINVAL;
        return NULL;
    }

    for (const char *p = hdr_fn; *p != '\0'; p++) {
        if (!isalnum((unsigned char)*p) && strchr("_./+-", *p) == NULL) {
            errno = EINVAL;
            return NULL;
        }
    }

    int need = snprintf(NULL, 0, fmt, hdr_fn, cc_cmd);
    if (need < 0) return NULL;

    char *cmd = malloc((size_t)need + 1);
    if (cmd == NULL) return NULL;
    snprintf(cmd, (size_t)need + 1, fmt, hdr_fn, cc_cmd);

    FILE *f = popen(cmd, "r");

    // keep popen()'s errno intact across free() for the caller's perror()
    int saved_errno = errno;
    free(cmd);
    errno = saved_errno;

    return f;
}

/// NUL-terminate word (word_len bytes) and store it as the next entry of idkws
static char **idkws_push(char **idkws, size_t *nidkws, char *word, size_t word_len) {
    idkws = xrealloc(idkws, (*nidkws + 1) * sizeof *idkws);
    word = xrealloc(word, word_len + 1);
    word[word_len] = '\0';
    idkws[(*nidkws)++] = word;
    return idkws;
}

/**
 * Parse a series of keywords/identifiers until a non-kw/id is found indicating the end.
 *
 * Whitespace separates words. On return *c holds the character that stopped
 * the scan (already consumed from f), or EOF.
 *
 * @param nidkws  receives the number of words returned
 * @return array of *nidkws malloc'd NUL-terminated words (NULL when there are
 *         none); the caller frees each word and the array.
 */
char **parse_c_header_identkws(size_t *nidkws, int *c, FILE *f) {
    char **idkws = NULL;
    char *word = NULL;
    size_t word_len = 0;

    *nidkws = 0;

    while ((*c = fgetc(f)) != EOF) {
        if (isspace(*c)) {
            // end of in progress word
            if (word_len > 0) {
                idkws = idkws_push(idkws, nidkws, word, word_len);
                word = NULL;
                word_len = 0;
            }
        } else if (is_ident_char(*c)) {
            word = buf_push(word, &word_len, *c);
        } else {
            // anything else breaks
            break;
        }
    }

    // store any last in progress word
    if (word_len > 0) {
        idkws = idkws_push(idkws, nidkws, word, word_len);
    }

    return idkws;
}

/**
 * Read a run of decimal digits from f and convert it with strtoll() base 0
 * (so a leading '0' means octal, as in C source).
 *
 * On return *c holds the first non-digit character (already consumed) or EOF.
 *
 * @return the value; 0 when no digit was read; INT64_MAX when the run has
 *         more digits than fit the internal buffer or the value overflows.
 */
int64_t parse_c_header_int(int *c, FILE *f) {
    char buf[32];
    size_t n = 0;
    int too_long = 0;

    while ((*c = fgetc(f)) != EOF && isdigit(*c)) {
        // always leave room for the terminator; keep consuming the digits so
        // the stream ends up positioned after the number either way
        if (n < sizeof buf - 1) buf[n++] = (char)*c;
        else too_long = 1;
    }

    if (too_long) return INT64_MAX;

    buf[n] = '\0';
    return (int64_t)strtoll(buf, NULL, 0);
}

/* Disabled work-in-progress scaffolding, kept verbatim. */
#if 0
typedef struct {
    size_t n;
    char ** strs;
} arg_t;

arg_t * parse_c_header_args(size_t *nargs, FILE *f) {

    *nargs = 0;
    arg_t *args = NULL;

    int c;
    while ( (c = fgetc(f)) != EOF ) {

        // consume anything that starts with '#' until EOF
        if ( c == '#' ) {
            while ( (c = fgetc(f)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        ungetc(c, f);

        int idkw_end;
        size_t nidkws = 0;
        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, f);

        switch (idkw_end) {
            default:
                fprintf(stdout, "Unhandled '%c'\n", idkw_end);
                goto ERROR;
            case ')': break;
            case ',': break;
            case '*': break;
            case '[':
                printf("[%ld]\n", parse_c_header_int(&idkw_end, f));
                break;
        }

        if ( nidkws > 0 ) {
            args = realloc(args, (*nargs + 1) * sizeof(arg_t));
            args[*nargs].n = nidkws;
            args[*nargs].strs = idkws;
            (*nargs)++;
        }
    }

EXIT:
    return args;
ERROR:
    *nargs = 0;
    return NULL;
}

typedef struct stmt_s {
    enum stmt_type {
        STMT_TYPE_TYPE,
        STMT_TYPE_VAR,
        STMT_TYPE_FUNC,
    } type;

    size_t n;
    char ** strs;
} stmt_t;

stmt_t parse_c_header_stmt(FILE *f) {

    stmt_t stmt = { .n = 0, .strs = 0 };

    int c;
    while ( (c = fgetc(f)) != EOF ) {

        // consume anything that starts with '#' until EOF
        if ( c == '#' ) {
            while ( (c = fgetc(f)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        int idkw_end;
        size_t nidkws = 0;
        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, f);

        switch (idkw_end) {
            default:
                break;
            case '(':
            case ')':
        }
    }
}
#endif

/**
 * Read the remainder of a '{' ... '}' body whose opening brace has already
 * been consumed, honouring nested braces.
 *
 * On return *c is the closing '}' of the body, or EOF if the input ended first.
 *
 * @param len  receives the number of bytes returned
 * @return malloc'd buffer of *len bytes, NOT NUL-terminated and without the
 *         closing brace; NULL when the body is empty. Caller frees.
 */
char *parse_c_header_decl_body(int *c, size_t *len, FILE *f) {
    char *body = NULL;
    size_t depth = 1;

    *len = 0;

    while ((*c = fgetc(f)) != EOF) {
        if (*c == '}' && depth == 1) break; // the brace closing our own body

        if (*c == '{') depth++;
        else if (*c == '}') depth--;

        body = buf_push(body, len, *c);
    }

    return body;
}

/**
 * Returns the length to the end of the next token delimited by delim starting
 * from the previous value of *p before the call.
 *
 * *p is advanced past the token and its delimiter, but never beyond p_end.
 * Returns 0 when nothing remains. An empty token (two adjacent delimiters, or
 * a leading delimiter) also has length 0, so callers that loop until 0 stop
 * there.
 */
size_t next_tok_len(char **p, char *p_end, char delim) {
    if (p == NULL || *p == NULL || p_end == NULL || *p >= p_end) return 0;

    char *end = memchr(*p, delim, (size_t)(p_end - *p));

    // if any remains return it as the last word
    if (end == NULL) end = p_end;

    size_t len = (size_t)(end - *p);
    *p = (end < p_end) ? end + 1 : p_end;

    return len;
}

/**
 * Get pointer to and length of the last identifier word in statement string s.
 *
 * Scans backwards over maximal runs of [A-Za-z0-9_] and returns the last run
 * that starts with [A-Za-z_]; runs starting with a digit (e.g. an array size)
 * are skipped.
 *
 * @param lwlen  receives the identifier length, 0 when none is found
 * @return pointer into s at the start of the identifier, or NULL
 */
char *parse_c_last_word(size_t *lwlen, char *s) {
    size_t end = (s != NULL) ? strlen(s) : 0;

    while (end > 0) {
        // skip trailing non-identifier characters
        while (end > 0 && !is_ident_char(s[end - 1])) end--;
        if (end == 0) break;

        // walk back to the start of this run
        size_t start = end;
        while (start > 0 && is_ident_char(s[start - 1])) start--;

        if (is_ident_start(s[start])) {
            *lwlen = end - start;
            return &s[start];
        }

        // run started with a digit: not an identifier, keep looking earlier
        end = start;
    }

    *lwlen = 0;
    return NULL;
}

/// True when one of the leading space-separated words of s[0..slen) is "struct" or "union"
static int decl_is_aggregate(char *s, size_t slen) {
    if (s == NULL || slen == 0) return 0;

    char *cur = s;
    char *tok = s;
    size_t tok_len;

    while ((tok_len = next_tok_len(&cur, s + slen, ' ')) != 0) {
        if (tok_len == sizeof("struct") - 1 && memcmp(tok, "struct", tok_len) == 0) return 1;
        if (tok_len == sizeof("union") - 1 && memcmp(tok, "union", tok_len) == 0) return 1;
        tok = cur;
    }

    return 0;
}

/// Parse a struct/union body recursively and append it to s as "{type,name;type,name;...}"
static char *append_aggregate_body(char *s, size_t *slen, char *body, size_t body_len) {
    size_t n_members = 0;
    char **members = NULL;

    // fmemopen() may reject an empty buffer, so only recurse when there is one
    if (body != NULL && body_len > 0) {
        FILE *fsub = fmemopen(body, body_len, "r");
        if (fsub != NULL) {
            members = parse_c_header(&n_members, fsub);
            fclose(fsub);
        }
    }

    s = buf_push(s, slen, '{');

    for (size_t i = 0; i < n_members; i++) {
        size_t name_len = 0;
        char *name = parse_c_last_word(&name_len, members[i]);

        if (name == NULL) {
            fprintf(stderr, "ERROR: identifier not found in s=\"%s\" body=", members[i]);
            fwrite(body, 1, body_len, stderr);
            fputc('\n', stderr);
        } else {
            // everything before the name is the type, everything after it
            // (e.g. "[]") is moved in front of the ',' so it stays with the type
            const char *suffix = name + name_len;

            s = buf_append(s, slen, members[i], (size_t)(name - members[i]));
            s = buf_append(s, slen, suffix, strlen(suffix));
            s = buf_push(s, slen, ',');
            s = buf_append(s, slen, name, name_len);
            s = buf_push(s, slen, ';');
        }

        free(members[i]);
    }
    free(members);

    return buf_push(s, slen, '}');
}

/// Finish declaration s: add one "[]" per array level, NUL-terminate, store it in decl
static char **decl_push(char **decl, size_t *ndecl, char *s, size_t slen, size_t n_array) {
    s = xrealloc(s, slen + 2 * n_array + 1);

    for (size_t i = 0; i < n_array; i++) {
        s[slen++] = '[';
        s[slen++] = ']';
    }
    s[slen] = '\0';

    decl = xrealloc(decl, (*ndecl + 1) * sizeof *decl);
    decl[(*ndecl)++] = s;
    return decl;
}

/**
 * Split preprocessed C source read from f into flattened declaration strings.
 *
 *  - lines introduced by '#' are skipped up to the end of the line
 *  - whitespace runs collapse to one space
 *  - "[...]" is dropped and replaced by a trailing "[]" per bracket nesting level
 *  - "= ..." initialisers are dropped
 *  - "(...)" is copied with an extra ',' before ')'; the rest up to ';' is dropped
 *  - "{...}" bodies are dropped, except for struct/union definitions whose
 *    members are re-emitted as "{type,name;...}"
 *
 * NOTE(review): the "[]" count is the peak bracket *nesting* depth, so
 * "a[2][3]" yields a single "[]" -- confirm that this is intended.
 *
 * @param ndecl  receives the number of declarations returned
 * @return array of *ndecl malloc'd NUL-terminated strings (NULL when there
 *         are none or f is NULL); the caller frees each string and the array.
 */
char **parse_c_header(size_t *ndecl, FILE *f) {
    char **decl = NULL;

    size_t slen = 0;
    char *s = NULL;

    size_t array_depth = 0, peak_array_depth = 0;

    *ndecl = 0;
    if (f == NULL) return NULL;

    int c;
    while ((c = fgetc(f)) != EOF) {

        // consume anything that starts with '#' until the end of the line
        if (c == '#') {
            while ((c = fgetc(f)) != EOF && c != '\n') {}
            if (c == EOF) break;
        }

        if (c == '[') {
            if (++array_depth > peak_array_depth) peak_array_depth = array_depth;
            continue;
        }
        if (c == ']') {
            // ignore a stray ']' instead of going negative and skipping the rest
            if (array_depth > 0) array_depth--;
            continue;
        }

        // if we are in an array definition we must skip over it to avoid
        // tripping up on operator keywords inside its size expression (e.g. sizeof)
        if (array_depth != 0) continue;

        // consume any function or struct body definitions
        if (c == '{') {
            size_t body_len = 0;
            char *body = parse_c_header_decl_body(&c, &body_len, f);

            // IF and ONLY IF we are defining a struct/union preserve the body
            if (decl_is_aggregate(s, slen) && memchr(s, '(', slen) == NULL) {
                s = append_aggregate_body(s, &slen, body, body_len);
            }

            free(body);

            if (c == EOF) break;
            continue;
        }

        // consume any static variable assignments
        if (c == '=') {
            while ((c = fgetc(f)) != EOF && c != ';') {}
            if (c == EOF) break;
        }

        if (c == '(') {
            // remove any space between the function identifier and '('
            if (slen > 0 && s[slen - 1] == ' ') slen--;

            s = buf_push(s, &slen, '(');

            while ((c = fgetc(f)) != EOF && c != ')') {
                if (!isspace(c)) {
                    s = buf_push(s, &slen, c);
                } else if (slen > 0 && s[slen - 1] != ' ') {
                    s = buf_push(s, &slen, ' ');
                }
            }
            if (c == EOF) break;

            // add extra ',' for parsing consistency
            s = buf_push(s, &slen, ',');
            s = buf_push(s, &slen, ')');

            // consume remainder of function prototype definition
            while ((c = fgetc(f)) != EOF && c != ';') {}
            if (c == EOF) break;
        }

        // end of declaration
        if (c == ';' || c == '}') {
            decl = decl_push(decl, ndecl, s, slen, peak_array_depth);

            s = NULL;
            slen = 0;
            // array state belongs to the declaration that just ended
            peak_array_depth = 0;
            continue;
        }

        if (isspace(c)) {
            // add space to the buffer but de-duplicate any repeats
            if (slen != 0 && s[slen - 1] != ' ') {
                s = buf_push(s, &slen, ' ');
            }
            continue;
        }

        s = buf_push(s, &slen, c);
    }

    // drop an unterminated declaration left over at EOF
    free(s);

    return decl;
}

/*
 * Usage: c-hdr-parser <header>
 *
 * Prints every declaration found in <header> (as seen through `cc -E`), one
 * per line. Exits non-zero on usage error or when the compiler fails.
 *
 * Define C_HDR_PARSER_NO_MAIN to build the parser functions without main().
 */
#ifndef C_HDR_PARSER_NO_MAIN
int main(int argc, char *argv[]) {

    if (argc <= 1) {
        fprintf(stderr, "usage: %s <header>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "c-hdr-parser");
        return 1;
    }

    FILE *fhdr = open_c_header("cc", argv[1]);

    if (fhdr == NULL) {
        perror(argv[1]);
        return 1;
    }

    size_t ndecl = 0;
    char **decl = parse_c_header(&ndecl, fhdr);

    for (size_t n = 0; n < ndecl; n++) {
        puts(decl[n]);
    }

/* Disabled work-in-progress scaffolding, kept verbatim. */
#if 0
    enum chp_state {
        CHP_STATE_KEYWORD,
        CHP_STATE_FUNC_ARGS,
    } state = CHP_STATE_KEYWORD;

    // TODO we may need to handle "" and '' containing '{' or '}'
    size_t cb_level = 0;

    int c;
    while ( (c = fgetc(fhdr)) != EOF ) {

        // consume anything that starts with '#' until EOF
        if ( c == '#' ) {
            while ( (c = fgetc(fhdr)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        if ( c == '{' ) { cb_level++; }

        if ( c == '}' ) {
            cb_level--;
            continue;
        }

        // basically just skip anything in curly braces
        if ( cb_level > 0 ) continue;

        //if ( isspace(c) ) continue;

        //putchar(c);

        if ( c == '_' || isalnum(c) ) {

            // need to put the char back before call to avoid loosing the first char
            ungetc(c, fhdr);

            int idkw_end;
            size_t nidkws = 0;

            char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, fhdr);
            for (size_t n=0; n < nidkws; n++) puts(idkws[n]);

            switch (idkw_end) {
                default:
                    printf("!!! UNHANDLED '%c'\n", idkw_end);
                    break;
                case '#':
                case '{':
                    ungetc(idkw_end, fhdr);
                    break;
                case ';': break;
                case ',': break;
                case '(':
                    /*
                    while ( idkw_end != ')' ) {

                        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, fhdr);
                        for (size_t n=0; n < nidkws; n++) {
                            putchar(' ');
                            fputs(idkws[n], stdout);
                        }

                        if ( idkw_end == '[' ) printf("[%ld]", parse_c_header_int(&idkw_end, fhdr));

                        if ( nidkws == 0 ) printf("!!! '%c'", idkw_end);

                        if ( idkw_end != '*' ) putchar('\n'); else fputs(" *", stdout);
                    }
                    */
                    {
                        size_t nargs;
                        arg_t *args = parse_c_header_args(&nargs, fhdr);

                        for (size_t n=0; n < nargs; n++) {
                            puts("***");
                            for (size_t nn=0; nn < args[n].n; nn++) {
                                printf(" %s", args[n].strs[nn]);
                            }
                        }
                    }
                    // consume the rest
                    while ( (c = fgetc(fhdr)) != ';' );
                    state = CHP_STATE_FUNC_ARGS;
                    break;
                case ')':
                    state = CHP_STATE_KEYWORD;
                    break;
                case '[':
                    printf("[%ld]\n", parse_c_header_int(&idkw_end, fhdr));
                    break;
            }

            printf("----------> '%c'\n", idkw_end);
        }

        //if ( c == ';' ) putchar('\n');
    }
#endif

    for (size_t n = 0; n < ndecl; n++) {
        free(decl[n]);
    }
    free(decl);

    // a non-zero status means the preprocessor failed (e.g. header not found)
    int status = pclose(fhdr);

    return status == 0 ? 0 : 1;
}
#endif /* C_HDR_PARSER_NO_MAIN */