wip system c header parser
This commit is contained in:
parent
d8c36904b6
commit
be69ebfaf5
1 changed files with 655 additions and 0 deletions
655
c-hdr-parser.c
Normal file
655
c-hdr-parser.c
Normal file
|
@ -0,0 +1,655 @@
|
|||
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
/**
 * Grow the heap string s1 and append s2 to it.
 *
 * @param s1  NUL-terminated string obtained from malloc/realloc/strdup.
 *            If NULL, nothing is done and NULL is returned, so a failed
 *            allocation propagates cleanly through a chain of calls.
 * @param s2  NUL-terminated string to append; NULL is treated as "nothing
 *            to append" and s1 is returned unchanged.
 * @return    The (possibly moved) string on success, or NULL if s1 is NULL
 *            or the reallocation fails. On reallocation failure the original
 *            s1 allocation is untouched and still owned by the caller.
 */
char *cstr_append(char *s1, const char *s2) {
    if (s1 == NULL) return NULL;
    if (s2 == NULL) return s1;

    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    char *grown = realloc(s1, len1 + len2 + 1);
    if (grown == NULL) return NULL;

    // copy s2 together with its terminating NUL; no need to rescan s1
    memcpy(grown + len1, s2, len2 + 1);
    return grown;
}
|
||||
|
||||
/**
 * Open a stream of the desired header after it has been run through the
 * preprocessing stage (`-E`) of the given compiler command. The compiler pulls
 * in every header it includes, evaluates all macros and strips all comments.
 *
 * The command executed is:  echo "#include <hdr_fn>" | cc_cmd -E -
 *
 * @param cc_cmd  Compiler command; inserted into the shell command line as-is.
 * @param hdr_fn  Header name as it would appear between '<' and '>'. Names that
 *                are empty or contain characters which would terminate the
 *                quoted shell word or the include directive are rejected.
 * @return        A read stream created by popen() (close it with pclose()),
 *                or NULL on failure with errno set (EINVAL for bad arguments).
 */
FILE *open_c_header(const char *cc_cmd, const char *hdr_fn) {

    static const char cmd_fmt[] = "echo \"#include <%s>\" | %s -E -";

    if ( cc_cmd == NULL || hdr_fn == NULL || *hdr_fn == '\0' ) {
        errno = EINVAL;
        return NULL;
    }

    // hdr_fn lands inside a double-quoted shell word: refuse anything that
    // could break out of the quotes or out of the <...> include directive
    if ( strpbrk(hdr_fn, "\"`$\\<>\r\n") != NULL ) {
        errno = EINVAL;
        return NULL;
    }

    // size the command in one go instead of growing it piece by piece
    int need = snprintf(NULL, 0, cmd_fmt, hdr_fn, cc_cmd);
    if ( need < 0 ) return NULL;

    char *cmd = malloc((size_t)need + 1);
    if ( cmd == NULL ) return NULL;

    snprintf(cmd, (size_t)need + 1, cmd_fmt, hdr_fn, cc_cmd);

    FILE *f = popen(cmd, "r");

    // keep popen's errno intact for the caller's perror()
    int saved_errno = errno;
    free(cmd);
    errno = saved_errno;

    return f;
}
|
||||
|
||||
/// realloc wrapper for parse_c_header_identkws: out-of-memory is fatal, and
/// stays fatal even when assert() is compiled out with NDEBUG.
static void *identkws_xrealloc(void *ptr, size_t size) {
    void *grown = realloc(ptr, size);
    if ( grown == NULL ) {
        perror("parse_c_header_identkws: realloc");
        abort();
    }
    return grown;
}

/// NUL-terminate word (its buffer must hold word_len + 1 bytes) and append it
/// to list, returning the grown list. Ownership of word moves into the list.
static char **identkws_push(char **list, size_t *count, char *word, size_t word_len) {
    word[word_len] = '\0';
    list = identkws_xrealloc(list, (*count + 1) * sizeof *list);
    list[*count] = word;
    (*count)++;
    return list;
}

/**
 * Parse a series of whitespace separated keywords/identifiers (runs of
 * [A-Za-z0-9_]) until a character that is neither whitespace nor part of a
 * word is read, or until EOF.
 *
 * @param nidkws  Receives the number of words returned.
 * @param c       Receives the last value read from f: the character that
 *                ended the series, or EOF.
 * @param f       Stream to read from.
 * @return        Heap array of *nidkws heap-allocated NUL-terminated strings
 *                (caller frees each string and the array), or NULL when no
 *                word was found. Aborts if memory cannot be allocated.
 */
char ** parse_c_header_identkws(size_t *nidkws, int *c, FILE *f) {

    *nidkws = 0;
    char ** idkws = NULL;

    size_t cur_kw_len = 0;
    char * cur_kw = NULL;

    while ( (*c = fgetc(f)) != EOF ) {

        if ( isspace(*c) ) {

            // whitespace ends the word in progress, if any
            if ( cur_kw_len > 0 ) {
                idkws = identkws_push(idkws, nidkws, cur_kw, cur_kw_len);
                cur_kw = NULL;
                cur_kw_len = 0;
            }

        } else if ( *c == '_' || isalnum(*c) ) { /* identifiers / keywords */

            // +2: room for this character and for the terminating NUL
            cur_kw = identkws_xrealloc(cur_kw, cur_kw_len + 2);
            cur_kw[cur_kw_len] = (char)*c;
            cur_kw_len++;

        } else { /* anything else ends the series */
            break;
        }
    }

    // store a word that was cut short by the terminating character or EOF
    if ( cur_kw_len > 0 ) {
        idkws = identkws_push(idkws, nidkws, cur_kw, cur_kw_len);
    }

    return idkws;
}
|
||||
|
||||
/**
 * Read a run of decimal digit characters from f and convert it to an integer.
 *
 * Conversion uses base 0, so a run with a leading '0' is read as octal, as in
 * a C integer constant.
 *
 * @param c  Receives the last value read from f: the first non-digit
 *           character, or EOF.
 * @param f  Stream to read from.
 * @return   The converted value; 0 if no digit was read; INT64_MAX if the
 *           value does not fit or if another character is read after
 *           MAX_DIGITS digits have been collected (in that case *c holds
 *           that character).
 */
int64_t parse_c_header_int(int *c, FILE *f) {

    enum { MAX_DIGITS = 32 };

    // one extra byte so the terminator always fits, even when the stream
    // ends right after MAX_DIGITS digits
    char buf[MAX_DIGITS + 1];
    size_t n = 0;

    while ( (*c = fgetc(f)) != EOF ) {

        if ( n >= MAX_DIGITS ) return INT64_MAX;
        if ( !isdigit(*c) ) break;

        buf[n] = (char)*c;
        n++;
    }

    buf[n] = '\0';

    // strtoll: long may be only 32 bits wide; on overflow it yields LLONG_MAX
    return (int64_t)strtoll(buf, NULL, 0);
}
|
||||
|
||||
#if 0
/*
 * Disabled work-in-progress: token based parsing of argument lists and
 * statements. Not compiled.
 */

/// One parsed argument: the list of keywords/identifiers that make it up.
typedef struct {
    size_t n;
    char ** strs;
} arg_t;

/// Free n heap strings and the array holding them.
static void free_str_list(char **strs, size_t n) {
    for (size_t i = 0; i < n; i++) free(strs[i]);
    free(strs);
}

/**
 * Read groups of keywords/identifiers from f until EOF, collecting every
 * non-empty group as one arg_t.
 *
 * @param nargs  Receives the number of entries returned (0 on error).
 * @param f      Stream to read from.
 * @return       Heap array of *nargs entries, or NULL if nothing was parsed,
 *               an unhandled terminating character was met, or memory ran
 *               out. On error everything collected so far is released.
 */
arg_t * parse_c_header_args(size_t *nargs, FILE *f) {

    *nargs = 0;
    arg_t *args = NULL;

    int c;
    while ( (c = fgetc(f)) != EOF ) {

        // consume anything that starts with '#' until end of line
        if ( c == '#' ) {
            while ( (c = fgetc(f)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        // give the character back so the word parser sees it
        ungetc(c, f);

        int idkw_end;
        size_t nidkws = 0;
        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, f);

        switch (idkw_end) {
            default:
                fprintf(stdout, "Unhandled '%c'\n", idkw_end);
                free_str_list(idkws, nidkws);
                goto error;
            case ')': break;
            case ',': break;
            case '*': break;
            case '[':
                printf("[%lld]\n", (long long)parse_c_header_int(&idkw_end, f));
                break;
        }

        if ( nidkws > 0 ) {
            arg_t *grown = realloc(args, (*nargs + 1) * sizeof *args);
            if ( grown == NULL ) {
                free_str_list(idkws, nidkws);
                goto error;
            }
            args = grown;
            args[*nargs].n = nidkws;
            args[*nargs].strs = idkws;
            (*nargs)++;
        }
    }

    return args;

error:
    for (size_t i = 0; i < *nargs; i++) free_str_list(args[i].strs, args[i].n);
    free(args);
    *nargs = 0;
    return NULL;
}

/// One parsed statement and its kind.
typedef struct stmt_s {
    enum stmt_type {
        STMT_TYPE_TYPE,
        STMT_TYPE_VAR,
        STMT_TYPE_FUNC,
    } type;

    size_t n;
    char ** strs;
} stmt_t;

/**
 * Unfinished stub: reads groups of keywords/identifiers from f until EOF and
 * discards them. Always returns an empty statement.
 */
stmt_t parse_c_header_stmt(FILE *f) {

    stmt_t stmt = { .n = 0, .strs = 0 };

    int c;
    while ( (c = fgetc(f)) != EOF ) {

        // consume anything that starts with '#' until end of line
        if ( c == '#' ) {
            while ( (c = fgetc(f)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        int idkw_end;
        size_t nidkws = 0;
        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, f);

        switch (idkw_end) {
            default:
                break;
            case '(':
            case ')':
                // TODO not implemented yet
                break;
        }

        // nothing keeps the words yet, so release them
        free_str_list(idkws, nidkws);
    }

    return stmt;
}
#endif
|
||||
|
||||
/**
 * Read the body of a '{' ... '}' block whose opening brace has already been
 * consumed, up to (but not including) the matching closing brace. Nested
 * brace pairs are copied into the body.
 *
 * @param c    Receives the last value read from f: the matching '}' or EOF
 *             if the stream ended first.
 * @param len  Receives the number of bytes in the returned body.
 * @param f    Stream to read from.
 * @return     Heap buffer of *len bytes, NOT NUL-terminated (caller frees),
 *             or NULL when the body is empty. Aborts if memory cannot be
 *             allocated.
 */
char * parse_c_header_decl_body(int *c, size_t *len, FILE *f) {

    *len = 0;
    char * body = NULL;
    size_t cap = 0;

    // we start inside the first brace pair
    size_t depth = 1;

    while ( (*c = fgetc(f)) != EOF ) {

        if ( *c == '{' ) {
            depth++;
        } else if ( *c == '}' ) {
            // the brace matching the already consumed '{' ends the body
            if ( depth == 1 ) break;
            depth--;
        }

        // grow geometrically rather than once per character
        if ( *len == cap ) {
            size_t new_cap = cap ? cap * 2 : 64;
            char *grown = realloc(body, new_cap);
            if ( grown == NULL ) {
                perror("parse_c_header_decl_body: realloc");
                abort();
            }
            body = grown;
            cap = new_cap;
        }

        body[*len] = (char)*c;
        (*len)++;
    }

    return body;
}
|
||||
|
||||
/**
 * Step over the next token in the byte range [*p, p_end), tokens being
 * separated by delim.
 *
 * @param p      In: start of the token. Out: start of the following token
 *               (one past the delimiter), or p_end when the token ran to the
 *               end of the range. Left unchanged when 0 is returned because
 *               the range is exhausted or an argument is NULL.
 * @param p_end  One past the last byte of the range.
 * @param delim  Delimiter byte.
 * @return       Length of the token that started at the incoming *p. 0 means
 *               either that nothing remains or that the token is empty
 *               (*p pointed at a delimiter); callers in this file treat 0 as
 *               the end of iteration.
 */
size_t next_tok_len(char **p, char *p_end, char delim) {

    // an empty buffer is passed as NULL/NULL: never hand that to memchr
    if ( p == NULL || *p == NULL || p_end == NULL || *p >= p_end ) return 0;

    char *start = *p;
    char *hit = memchr(start, delim, (size_t)(p_end - start));

    if ( hit == NULL ) {
        // whatever remains is the last token; stop exactly at p_end so the
        // cursor never points beyond one-past-the-end of the buffer
        *p = p_end;
        return (size_t)(p_end - start);
    }

    *p = hit + 1;
    return (size_t)(hit - start);
}
|
||||
|
||||
/// True for characters that may appear inside a C identifier.
static int last_word_is_ident_char(char ch) {
    // ctype functions need an unsigned char value; plain char may be negative
    return ch == '_' || isalnum((unsigned char)ch);
}

/**
 * Find the last identifier in statement string s.
 *
 * The string is scanned from its end. Each run of [A-Za-z0-9_] is a
 * candidate; a run is accepted when its first character is a letter or '_'.
 * Runs starting with a digit (e.g. plain numbers) are skipped and the scan
 * continues towards the front.
 *
 * @param lwlen  Receives the identifier length, or 0 when none is found.
 * @param s      NUL-terminated string to search; may be NULL.
 * @return       Pointer into s at the start of the identifier, or NULL when
 *               none is found.
 */
char *parse_c_last_word(size_t *lwlen, char *s) {

    *lwlen = 0;

    if ( s == NULL ) return NULL;

    size_t end = strlen(s);

    while ( end > 0 ) {

        // skip backwards over anything that cannot be part of an identifier
        while ( end > 0 && !last_word_is_ident_char(s[end - 1]) ) end--;
        if ( end == 0 ) break;

        // walk back to the first character of this run (a run may be a
        // single character long)
        size_t start = end;
        while ( start > 0 && last_word_is_ident_char(s[start - 1]) ) start--;

        // a valid identifier must start with [a-zA-Z_]
        if ( s[start] == '_' || isalpha((unsigned char)s[start]) ) {
            *lwlen = end - start;
            return &s[start];
        }

        // not an identifier: keep looking in front of this run
        end = start;
    }

    return NULL;
}
|
||||
|
||||
char ** parse_c_header(size_t *ndecl, FILE *f);

/// Growable byte buffer holding the declaration currently being assembled.
struct decl_buf {
    char * data;
    size_t len;
    size_t cap;
};

/// Make room for `extra` more bytes. Out-of-memory is fatal.
static void decl_buf_reserve(struct decl_buf *b, size_t extra) {
    if ( extra <= b->cap - b->len ) return;

    size_t cap = b->cap ? b->cap : 64;
    while ( cap < b->len + extra ) cap *= 2;

    char *grown = realloc(b->data, cap);
    if ( grown == NULL ) {
        perror("parse_c_header: realloc");
        abort();
    }
    b->data = grown;
    b->cap = cap;
}

/// Append one character.
static void decl_buf_putc(struct decl_buf *b, int ch) {
    decl_buf_reserve(b, 1);
    b->data[b->len] = (char)ch;
    b->len++;
}

/// Append n bytes from src.
static void decl_buf_append(struct decl_buf *b, const char *src, size_t n) {
    if ( n == 0 ) return;
    decl_buf_reserve(b, n);
    memcpy(b->data + b->len, src, n);
    b->len += n;
}

/// Append a single space unless the buffer is empty or already ends in one.
static void decl_buf_put_space(struct decl_buf *b) {
    if ( b->len != 0 && b->data[b->len - 1] != ' ' ) decl_buf_putc(b, ' ');
}

/// Discard input up to and including `stop`; returns `stop` or EOF.
static int decl_skip_until(FILE *f, int stop) {
    int c;
    while ( (c = fgetc(f)) != EOF && c != stop );
    return c;
}

/// True when the text collected so far has "struct" or "union" among its
/// space separated words and contains no '(' (i.e. it is not a function).
static int decl_is_aggregate(const struct decl_buf *b) {

    if ( b->len == 0 ) return 0;

    char *end = b->data + b->len;
    char *p = b->data, *word = b->data;
    size_t word_len;

    while ( (word_len = next_tok_len(&p, end, ' ')) != 0 ) {

        if ( (word_len == sizeof("struct") - 1 && 0 == memcmp(word, "struct", word_len)) ||
             (word_len == sizeof("union") - 1  && 0 == memcmp(word, "union", word_len)) ) {
            return NULL == memchr(b->data, '(', b->len);
        }

        word = p;
    }

    return 0;
}

/// Parse a struct/union body recursively and append it to b as
/// "{<type><array suffix>,<name>;...}" with one entry per member.
static void decl_append_members(struct decl_buf *b, char *body, size_t body_len) {

    size_t n_members = 0;
    char ** members = NULL;

    // an empty body has no members; do not open a zero length memory stream
    if ( body != NULL && body_len > 0 ) {
        FILE *fsub = fmemopen(body, body_len, "r");
        if ( fsub == NULL ) {
            perror("parse_c_header: fmemopen");
        } else {
            members = parse_c_header(&n_members, fsub);
            fclose(fsub);
        }
    }

    decl_buf_putc(b, '{');

    for (size_t n = 0; n < n_members; n++) {

        char * member = members[n];
        size_t name_len = 0;
        char * name = parse_c_last_word(&name_len, member);

        if ( name == NULL ) {
            // keep the whole diagnostic on stderr; stdout carries the output
            fprintf(stderr, "ERROR: identifier not found in s=\"%s\" body=", member);
            fwrite(body, 1, body_len, stderr);
            fputc('\n', stderr);
        } else {
            // text after the name is the array suffix ("[]"...), if any
            const char * suffix = name + name_len;

            decl_buf_append(b, member, (size_t)(name - member));
            decl_buf_append(b, suffix, strlen(suffix));
            decl_buf_putc(b, ',');
            decl_buf_append(b, name, name_len);
            decl_buf_putc(b, ';');
        }

        free(member);
    }

    free(members);

    decl_buf_putc(b, '}');
}

/**
 * Split preprocessed C source into a list of normalised declaration strings.
 *
 * While reading f:
 *  - lines introduced by '#' are dropped;
 *  - everything between '[' and ']' is dropped; when a declaration ends, "[]"
 *    is appended once per level of the deepest bracket nesting seen in it;
 *  - a '{' body is replaced by its flattened member list when the text before
 *    it contains the word struct or union and no '('; otherwise the body is
 *    dropped;
 *  - an initialiser ('=' up to ';') is dropped;
 *  - a parameter list is copied with runs of whitespace collapsed and
 *    ",)" written in place of ')', the rest up to ';' being dropped;
 *  - other runs of whitespace collapse to one space, leading ones are dropped;
 *  - ';' or a stray '}' ends the declaration.
 *
 * @param ndecl  Receives the number of declarations returned.
 * @param f      Stream to read from; NULL yields an empty result.
 * @return       Heap array of *ndecl heap-allocated NUL-terminated strings
 *               (caller frees each string and the array), or NULL when no
 *               declaration was found. Aborts if memory cannot be allocated.
 */
char ** parse_c_header(size_t *ndecl, FILE *f) {

    *ndecl = 0;
    char ** decl = NULL;

    if ( f == NULL ) return NULL;

    struct decl_buf cur = { NULL, 0, 0 };

    int64_t array_count = 0, peak_array_count = 0;

    int c;
    while ( (c = fgetc(f)) != EOF ) {

        // consume anything that starts with '#' until end of line
        if ( c == '#' ) {
            c = decl_skip_until(f, '\n');
            if ( c == EOF ) break;
        }

        if ( c == '[' ) {
            array_count++;
            if ( array_count > peak_array_count ) peak_array_count = array_count;
            continue;
        }
        if ( c == ']' ) {
            // a stray ']' must not push the depth negative, which would make
            // the parser skip the rest of the input
            if ( array_count > 0 ) array_count--;
            continue;
        }

        // if we are in an array definition we must skip over it to avoid
        // tripping up on operator keywords inside its size expression (e.g. sizeof)
        if ( array_count != 0 ) continue;

        // consume any function or struct body definitions
        if ( c == '{' ) {

            size_t body_len = 0;
            char * body = parse_c_header_decl_body(&c, &body_len, f);

            // IF and ONLY IF we are defining a struct/union preserve the body
            if ( decl_is_aggregate(&cur) ) decl_append_members(&cur, body, body_len);

            free(body);

            if ( c == EOF ) break;
            continue;
        }

        // consume any static variable assignments
        if ( c == '=' ) {
            c = decl_skip_until(f, ';');
            if ( c == EOF ) break;
        }

        if ( c == '(' ) {

            // remove any space between the function identifier and '('
            if ( cur.len > 0 && cur.data[cur.len - 1] == ' ' ) cur.len--;

            decl_buf_putc(&cur, '(');

            while ( (c = fgetc(f)) != EOF && c != ')' ) {
                if ( !isspace(c) ) decl_buf_putc(&cur, c);
                else decl_buf_put_space(&cur);
            }
            if ( c == EOF ) break;

            // add extra ',' for parsing consistency
            decl_buf_putc(&cur, ',');
            decl_buf_putc(&cur, ')');

            // consume remainder of function prototype definition
            c = decl_skip_until(f, ';');
            if ( c == EOF ) break;
        }

        // end of declaration
        if ( c == ';' || c == '}' ) {

            for (int64_t n = 0; n < peak_array_count; n++) {
                decl_buf_append(&cur, "[]", 2);
            }
            decl_buf_putc(&cur, '\0');

            char ** grown = realloc(decl, (*ndecl + 1) * sizeof *decl);
            if ( grown == NULL ) {
                perror("parse_c_header: realloc");
                abort();
            }
            decl = grown;
            decl[*ndecl] = cur.data;
            (*ndecl)++;

            // start the next declaration from scratch; the array depth seen
            // in this one must not leak into the following declarations
            cur.data = NULL;
            cur.len = 0;
            cur.cap = 0;
            peak_array_count = 0;
            continue;
        }

        if ( isspace(c) ) {
            // add space to the buffer but de-duplicate any repeats
            decl_buf_put_space(&cur);
            continue;
        }

        decl_buf_putc(&cur, c);
    }

    // drop an unterminated declaration left over at EOF
    free(cur.data);

    return decl;
}
|
||||
|
||||
/**
 * Usage: <program> <header>
 *
 * Runs the header through `cc -E`, parses the result and prints one
 * normalised declaration per line on stdout.
 *
 * @return 0 on success; 1 when the argument is missing, the preprocessor
 *         could not be started, or the preprocessor command reported failure.
 */
int main(int argc, char *argv[]) {

    if ( argc <= 1 ) {
        fprintf(stderr, "usage: %s <header>\n", argc > 0 ? argv[0] : "c-hdr-parser");
        return 1;
    }

    FILE *fhdr = open_c_header("cc", argv[1]);

    if ( fhdr == NULL ) {
        perror(argv[1]);
        return 1;
    }

    size_t ndecl = 0;
    char **decl = parse_c_header(&ndecl, fhdr);

    for (size_t n=0; n < ndecl; n++) {
        puts(decl[n]);
        free(decl[n]);
    }
    free(decl);

#if 0
    /* Disabled earlier prototype of the parsing loop. Not compiled. */
    enum chp_state {
        CHP_STATE_KEYWORD,
        CHP_STATE_FUNC_ARGS,
    } state = CHP_STATE_KEYWORD;

    // TODO we may need to handle "" and '' containing '{' or '}'
    size_t cb_level = 0;

    int c;
    while ( (c = fgetc(fhdr)) != EOF ) {

        // consume anything that starts with '#' until EOF
        if ( c == '#' ) {
            while ( (c = fgetc(fhdr)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }

        if ( c == '{' ) { cb_level++; }

        if ( c == '}' ) {
            cb_level--;
            continue;
        }

        // basically just skip anything in curly braces
        if ( cb_level > 0 ) continue;

        //if ( isspace(c) ) continue;

        //putchar(c);

        if ( c == '_' || isalnum(c) ) {

            // need to put the char back before call to avoid loosing the first char
            ungetc(c, fhdr);

            int idkw_end;
            size_t nidkws = 0;

            char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, fhdr);
            for (size_t n=0; n < nidkws; n++) puts(idkws[n]);

            switch (idkw_end) {
                default:
                    printf("!!! UNHANDLED '%c'\n", idkw_end);
                    break;
                case '#':
                case '{':
                    ungetc(idkw_end, fhdr);
                    break;
                case ';': break;
                case ',': break;
                case '(':
                    /*
                    while ( idkw_end != ')' ) {

                        char **idkws = parse_c_header_identkws(&nidkws, &idkw_end, fhdr);
                        for (size_t n=0; n < nidkws; n++) {
                            putchar(' ');
                            fputs(idkws[n], stdout);
                        }

                        if ( idkw_end == '[' ) printf("[%ld]", parse_c_header_int(&idkw_end, fhdr));

                        if ( nidkws == 0 ) printf("!!! '%c'", idkw_end);

                        if ( idkw_end != '*' ) putchar('\n'); else fputs(" *", stdout);
                    }
                    */
                    {
                        size_t nargs;
                        arg_t *args = parse_c_header_args(&nargs, fhdr);

                        for (size_t n=0; n < nargs; n++) {
                            puts("***");
                            for (size_t nn=0; nn < args[n].n; nn++) {
                                printf(" %s", args[n].strs[nn]);
                            }
                        }
                    }
                    // consume the rest
                    while ( (c = fgetc(fhdr)) != ';' );
                    state = CHP_STATE_FUNC_ARGS;
                    break;
                case ')':
                    state = CHP_STATE_KEYWORD;
                    break;
                case '[':
                    printf("[%ld]\n", parse_c_header_int(&idkw_end, fhdr));
                    break;
            }

            printf("----------> '%c'\n", idkw_end);
        }

        //if ( c == ';' ) putchar('\n');
    }
#endif

    // a popen() stream must be closed with pclose(), which also reports how
    // the preprocessor command ended
    int status = pclose(fhdr);
    if ( status != 0 ) {
        fprintf(stderr, "%s: preprocessor command failed\n", argv[1]);
        return 1;
    }

    return 0;
}
|
Loading…
Reference in a new issue