/*
 * lang_experiment/c-hdr-parser.c
 *
 * 578 lines
 * 15 KiB
 * C
 */

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Append s2 to the heap string s1, growing s1 with realloc().
 *
 * @param s1 NUL-terminated string obtained from malloc()/realloc()/strdup(),
 *           or NULL, which is treated as an empty string.
 * @param s2 NUL-terminated string to append; NULL is treated as empty.
 * @return   The (possibly moved) concatenated string, or NULL when the
 *           allocation fails. On failure s1 is left untouched and is still
 *           owned by the caller, who must free it.
 */
char *cstr_append(char *s1, const char *s2) {
    size_t len1 = (s1 != NULL) ? strlen(s1) : 0;
    size_t len2 = (s2 != NULL) ? strlen(s2) : 0;

    /* keep the result in a temporary so a failure does not lose s1 */
    char *grown = realloc(s1, len1 + len2 + 1);
    if (grown == NULL) {
        return NULL;
    }
    if (len2 > 0) {
        /* lengths are already known, no need to rescan like strcat() */
        memcpy(grown + len1, s2, len2);
    }
    grown[len1 + len2] = '\0';
    return grown;
}
/**
 * Open a stream of the desired header after the preprocessing stage of the
 * given compiler command. The command that is run is
 *
 *     echo "#include <hdr_fn>" | cc_cmd -I . -E -
 *
 * so the stream carries the header together with everything it includes,
 * as emitted by the compiler's -E stage.
 *
 * @param cc_cmd Compiler command line prefix (may contain flags); passed to
 *               the shell as-is.
 * @param hdr_fn Header name to include. Names containing characters that are
 *               special inside a double-quoted shell word (", `, $, \),
 *               a '>' (which would end the #include) or a newline are
 *               rejected with errno = EINVAL instead of being handed to the
 *               shell.
 * @return A stream opened with popen() (close it with pclose()), or NULL on
 *         invalid arguments, allocation failure or popen() failure.
 */
FILE *open_c_header(const char *cc_cmd, const char *hdr_fn) {
    static const char fmt[] = "echo \"#include <%s>\" | %s -I . -E -";

    if (cc_cmd == NULL || hdr_fn == NULL) {
        errno = EINVAL;
        return NULL;
    }
    /* hdr_fn ends up inside "..." in a shell command: refuse anything that
     * could break out of the quotes or trigger an expansion */
    if (strpbrk(hdr_fn, "\"`$\\>\n") != NULL) {
        errno = EINVAL;
        return NULL;
    }

    /* measure first, then format into an exactly sized buffer */
    int need = snprintf(NULL, 0, fmt, hdr_fn, cc_cmd);
    if (need < 0) {
        return NULL;
    }
    char *cmd = malloc((size_t)need + 1);
    if (cmd == NULL) {
        return NULL;
    }
    snprintf(cmd, (size_t)need + 1, fmt, hdr_fn, cc_cmd);

    FILE *f = popen(cmd, "r");
    /* keep popen()'s errno intact for the caller's perror() */
    int saved_errno = errno;
    free(cmd);
    errno = saved_errno;
    return f;
}
/* realloc() that terminates the program on failure, so the check cannot be
 * compiled out the way an assert() can. */
static void *identkws_xrealloc(void *ptr, size_t size) {
    void *grown = realloc(ptr, size);
    if (grown == NULL) {
        fputs("parse_c_header_identkws: out of memory\n", stderr);
        abort();
    }
    return grown;
}

/* NUL-terminate word (word_len bytes, heap allocated) and append it to list. */
static char **identkws_push_word(char **list, size_t *count, char *word, size_t word_len) {
    word = identkws_xrealloc(word, word_len + 1);
    word[word_len] = '\0';
    list = identkws_xrealloc(list, (*count + 1) * sizeof *list);
    list[*count] = word;
    (*count)++;
    return list;
}

/**
 * Parse a series of whitespace-separated keywords/identifiers from f until a
 * character that is neither whitespace, '_' nor alphanumeric is read, or EOF.
 *
 * @param nidkws Receives the number of words returned.
 * @param c      Receives the last value returned by fgetc(): the character
 *               that ended the series, or EOF.
 * @param f      Stream to read from.
 * @return Heap array of *nidkws heap-allocated NUL-terminated words (NULL when
 *         no word was read). The caller owns the array and every word.
 *         Allocation failure terminates the program.
 */
char **parse_c_header_identkws(size_t *nidkws, int *c, FILE *f) {
    char **idkws = NULL;
    char *cur_kw = NULL;
    size_t cur_kw_len = 0;

    *nidkws = 0;
    while ((*c = fgetc(f)) != EOF) {
        if (isspace(*c)) {
            /* whitespace ends the word in progress, if any */
            if (cur_kw_len > 0) {
                idkws = identkws_push_word(idkws, nidkws, cur_kw, cur_kw_len);
                cur_kw = NULL;
                cur_kw_len = 0;
            }
        } else if (*c == '_' || isalnum(*c)) {
            cur_kw = identkws_xrealloc(cur_kw, cur_kw_len + 1);
            cur_kw[cur_kw_len] = (char)*c;
            cur_kw_len++;
        } else {
            /* anything else ends the series; *c tells the caller what it was */
            break;
        }
    }
    /* store a word that was cut short by EOF or by the terminating character */
    if (cur_kw_len > 0) {
        idkws = identkws_push_word(idkws, nidkws, cur_kw, cur_kw_len);
    }
    return idkws;
}
/**
 * Read the rest of a brace-delimited body from f. The opening '{' must
 * already have been consumed; reading stops at the matching '}' (nested
 * braces are tracked and copied) or at EOF.
 *
 * @param c   Receives the last value returned by fgetc(): '}' when the body
 *            was closed, EOF otherwise.
 * @param len Receives the number of body bytes returned (the closing '}' is
 *            not included).
 * @param f   Stream to read from.
 * @return Heap buffer holding the body, or NULL when the body is empty. A
 *         non-NULL buffer carries a terminating NUL at index *len (not counted
 *         in *len) so it can be handed to string functions. The caller frees
 *         it. Allocation failure terminates the program.
 */
char *parse_c_header_decl_body(int *c, size_t *len, FILE *f) {
    char *s = NULL;
    size_t cap = 0;
    size_t level = 1;

    *len = 0;
    while ((*c = fgetc(f)) != EOF && (*c != '}' || level > 1)) {
        if (*c == '{') {
            level++;
        } else if (*c == '}') {
            level--;
        }
        /* keep room for this byte plus the trailing NUL; grow geometrically
         * instead of once per byte */
        if (*len + 2 > cap) {
            size_t new_cap = (cap == 0) ? 64 : cap * 2;
            char *grown = realloc(s, new_cap);
            if (grown == NULL) {
                fputs("parse_c_header_decl_body: out of memory\n", stderr);
                abort();
            }
            s = grown;
            cap = new_cap;
        }
        s[*len] = (char)*c;
        (*len)++;
    }
    if (s != NULL) {
        s[*len] = '\0';
    }
    return s;
}
/**
 * Read the rest of a parenthesised argument list from f. The opening '('
 * must already have been consumed; reading stops at the matching ')' (nested
 * parentheses are tracked and copied) or at EOF.
 *
 * Every ',' is rewritten to ';' (at any nesting depth) and one final ';' is
 * appended, so the result reads like a sequence of declarations.
 *
 * @param c   Receives the last value returned by fgetc(): ')' when the list
 *            was closed, EOF otherwise.
 * @param len Receives the number of bytes returned, including the appended
 *            ';' (so it is always at least 1).
 * @param f   Stream to read from.
 * @return Heap buffer, never NULL, with a terminating NUL at index *len (not
 *         counted in *len). The caller frees it. Allocation failure
 *         terminates the program.
 */
char *parse_c_header_decl_args(int *c, size_t *len, FILE *f) {
    char *s = NULL;
    size_t cap = 0;
    size_t level = 1;
    int done = 0;

    *len = 0;
    while (!done) {
        char out;

        *c = fgetc(f);
        if (*c == EOF || (*c == ')' && level <= 1)) {
            /* end of the list: close the last declaration */
            out = ';';
            done = 1;
        } else {
            if (*c == '(') {
                level++;
            } else if (*c == ')') {
                level--;
            }
            out = (*c == ',') ? ';' : (char)*c;
        }

        /* keep room for this byte plus the trailing NUL */
        if (*len + 2 > cap) {
            size_t new_cap = (cap == 0) ? 64 : cap * 2;
            char *grown = realloc(s, new_cap);
            if (grown == NULL) {
                fputs("parse_c_header_decl_args: out of memory\n", stderr);
                abort();
            }
            s = grown;
            cap = new_cap;
        }
        s[*len] = out;
        (*len)++;
    }
    s[*len] = '\0';
    return s;
}
/**
 * Tokenizer step over the byte range [*p, p_end).
 *
 * Returns the length of the token that starts at the value *p had before the
 * call and ends just before the next delim (or at p_end when no delim is
 * left). *p is then advanced past the delimiter, or set to p_end after the
 * final token, so it never leaves the range.
 *
 * A return value of 0 means either that the range is exhausted or that the
 * token is empty (two adjacent delimiters); callers that loop "while the
 * result is non-zero" therefore stop at the first empty token.
 *
 * @param p     In/out cursor; NULL or a NULL cursor yields 0.
 * @param p_end One past the last byte of the range.
 * @param delim Token delimiter.
 */
size_t next_tok_len(char **p, char *p_end, char delim) {
    if (p == NULL || *p == NULL || p_end == NULL || *p >= p_end) {
        return 0;
    }

    char *start = *p;
    char *end = memchr(start, delim, (size_t)(p_end - start));
    if (end == NULL) {
        /* whatever remains is the last token; park the cursor at p_end
         * rather than stepping beyond one-past-the-end */
        *p = p_end;
        return (size_t)(p_end - start);
    }
    *p = end + 1;
    return (size_t)(end - start);
}
/* True for characters that may appear inside a C identifier. */
static int last_word_is_ident_char(char ch) {
    /* <ctype.h> functions require an unsigned char value */
    return ch == '_' || isalnum((unsigned char)ch);
}

/**
 * Find the last identifier in the statement string s.
 *
 * The string is scanned from the end for maximal runs of [A-Za-z0-9_]. The
 * first run (seen from the end) that starts with a letter or '_' is the
 * result; runs that start with a digit (numbers) are skipped. Identifiers of
 * any length, including a single character, are recognised.
 *
 * @param lwlen Receives the identifier length, or 0 when none is found.
 * @param s     NUL-terminated string to search; NULL yields no result.
 * @return Pointer into s at the start of the identifier, or NULL when s
 *         contains no identifier.
 */
char *parse_c_last_word(size_t *lwlen, char *s) {
    *lwlen = 0;
    if (s == NULL) {
        return NULL;
    }

    size_t end = strlen(s);
    while (end > 0) {
        /* drop trailing non-identifier characters */
        while (end > 0 && !last_word_is_ident_char(s[end - 1])) {
            end--;
        }
        /* walk back to the start of the run that ends at `end` */
        size_t start = end;
        while (start > 0 && last_word_is_ident_char(s[start - 1])) {
            start--;
        }
        if (start == end) {
            break; /* nothing but non-identifier characters left */
        }
        /* a valid identifier must start with [a-zA-Z_] */
        if (s[start] == '_' || isalpha((unsigned char)s[start])) {
            *lwlen = end - start;
            return &s[start];
        }
        /* the run was a number (or started with a digit): keep looking */
        end = start;
    }
    return NULL;
}
char ** parse_c_header(size_t *ndecl, FILE *f);

/* realloc() for the output buffer; terminates the program on failure because
 * parse_c_header_sub has no way to report it. */
static char *sub_out_xrealloc(char *buf, size_t size) {
    char *grown = realloc(buf, size);
    if (grown == NULL) {
        fputs("parse_c_header_sub: out of memory\n", stderr);
        abort();
    }
    return grown;
}

/**
 * Parse the declarations found in the buffer sub (a struct/union body or a
 * rewritten argument list) with parse_c_header() and append a normalised
 * form of each one to out.
 *
 * For every declaration the last identifier is taken to be the declared
 * name. The text before it and the text after it (e.g. "[]") are joined to
 * form the type, and the entry is written as
 *
 *     <type>,<name>;
 *
 * If the type text contains no identifier of its own (e.g. a lone "int"),
 * the identifier found is treated as part of the type and the entry becomes
 *
 *     <type-text><identifier>,;
 *
 * @param out     Heap buffer to append to (may be NULL when *out_len is 0).
 *                It is not NUL-terminated on return.
 * @param out_len In/out length of out in bytes.
 * @param sub     Buffer to parse; only read.
 * @param sub_len Length of sub in bytes.
 * @return The (possibly moved) output buffer. When the in-memory stream
 *         cannot be opened, out is returned unchanged. Allocation failure
 *         terminates the program.
 */
char * parse_c_header_sub(char *out, size_t *out_len, const char *sub, size_t sub_len) {
    FILE *fsub = fmemopen((void*)sub, sub_len, "r");
    if ( fsub == NULL ) {
        /* nothing can be parsed (some C libraries reject an empty buffer) */
        return out;
    }

    size_t n_sub_decl = 0;
    char ** sub_decl = parse_c_header(&n_sub_decl, fsub);
    /* every declaration is an independent copy, so the stream is done */
    fclose(fsub);

    for (size_t n = 0; n < n_sub_decl; n++) {
        char *decl = sub_decl[n];
        size_t decl_len = strlen(decl);

        size_t lwlen = 0;
        char *lastword = parse_c_last_word(&lwlen, decl);
        if ( lastword == NULL ) {
            /* no identifier at all: everything is "type", the name is empty */
            lastword = decl + decl_len;
            lwlen = 0;
        }
        size_t type_len = (size_t)(lastword - decl);
        size_t type_array_len = decl_len - type_len - lwlen;

        /* room for: type + array suffix + ',' + identifier + ';'
         * (the temporary NUL below lands inside that space) */
        out = sub_out_xrealloc(out, (*out_len) + type_len + type_array_len + lwlen + 2);
        size_t type_offset = *out_len;

        /* type identifiers and pointers */
        memcpy(out + (*out_len), decl, type_len);
        (*out_len) += type_len;
        /* whatever follows the identifier, i.e. the array []'s */
        memcpy(out + (*out_len), lastword + lwlen, type_array_len);
        (*out_len) += type_array_len;

        /* temporarily NUL-terminate so the type text can be searched */
        out[*out_len] = 0;
        size_t type_lwlen = 0;
        int type_has_word = (parse_c_last_word(&type_lwlen, out + type_offset) != NULL);

        /* lastword is a variable identifier: separate it from the type */
        if ( type_has_word ) {
            out[(*out_len)] = ',';
            (*out_len)++;
        }
        if ( lwlen > 0 ) {
            memcpy(out + (*out_len), lastword, lwlen);
            (*out_len) += lwlen;
        }
        /* lastword is part of the type: the name that follows is empty */
        if ( !type_has_word ) {
            out[(*out_len)] = ',';
            (*out_len)++;
        }
        out[(*out_len)] = ';';
        (*out_len)++;

        free(decl);
    }
    free(sub_decl);
    return out;
}
/* Report an allocation failure and terminate; parse_c_header has no error
 * channel that could carry it. */
static void hdr_die_oom(void) {
    fputs("parse_c_header: out of memory\n", stderr);
    abort();
}

/* Append n bytes from src to the heap buffer *buf of length *len. */
static void hdr_buf_append(char **buf, size_t *len, const char *src, size_t n) {
    char *grown = realloc(*buf, *len + n + 1);
    if ( grown == NULL ) hdr_die_oom();
    if ( n > 0 ) memcpy(grown + *len, src, n);
    *buf = grown;
    *len += n;
}

/* Append a single byte to the heap buffer *buf of length *len. */
static void hdr_buf_putc(char **buf, size_t *len, char ch) {
    hdr_buf_append(buf, len, &ch, 1);
}

/* Return buf (len bytes, may be NULL) with a NUL stored at index len. */
static char *hdr_buf_terminate(char *buf, size_t len) {
    hdr_buf_append(&buf, &len, "", 1);
    return buf;
}

/* True when the word_len bytes at word spell exactly the keyword kw. */
static int hdr_word_is(const char *word, size_t word_len, const char *kw) {
    return word_len == strlen(kw) && 0 == strncmp(kw, word, word_len);
}

/**
 * Split preprocessed C source read from f into normalised declarations.
 *
 * The stream is consumed to EOF. While a declaration is assembled:
 *  - lines starting with '#' are skipped,
 *  - runs of whitespace collapse to a single space,
 *  - everything inside [...] is dropped; when the declaration ends, "[]" is
 *    appended once per level of the deepest bracket nesting seen in it,
 *  - text from '=' up to the next ';' (an initialiser) is dropped,
 *  - a '{...}' body is dropped, except that
 *      * after a struct/union keyword (and no '(' so far) it is replaced by
 *        '{' + parse_c_header_sub(body) + '}',
 *      * after an enum keyword it is replaced by '{' + one "int, NAME;" entry
 *        per enumerator + '}',
 *  - a '(...)' argument list is replaced by '(' + parse_c_header_sub(args)
 *    + ')'; a list that starts with '*' is taken as a function-pointer
 *    declarator whose name is appended after the real argument list.
 * A declaration ends at ';' or at the '}' closing a function body.
 *
 * @param ndecl Receives the number of declarations returned.
 * @param f     Stream to read from.
 * @return Heap array of *ndecl heap-allocated NUL-terminated strings (NULL
 *         when there are none). The caller owns the array and the strings.
 *         Allocation failure terminates the program.
 */
char ** parse_c_header(size_t *ndecl, FILE *f) {
    char ** decl = NULL;
    char * s = NULL;        /* declaration being assembled, not NUL-terminated */
    size_t slen = 0;
    int64_t array_count = 0, peak_array_count = 0;
    char *func_name = NULL; /* pending function-pointer name, if any */
    int c;

    *ndecl = 0;
    while ( (c = fgetc(f)) != EOF ) {
        // consume anything that starts with '#' until the end of the line
        if ( c == '#' ) {
            while ( (c = fgetc(f)) != EOF && c != '\n' );
            if ( c == EOF ) break;
        }
        if ( c == '[' ) {
            array_count++;
            if ( array_count > peak_array_count ) peak_array_count = array_count;
            continue;
        }
        if ( c == ']' ) {
            array_count--;
            continue;
        }
        // if we are in an array definition we must skip over it to avoid
        // tripping up on operator keywords inside its size expression (e.g. sizeof)
        if ( array_count != 0 ) continue;

        // consume any function or struct body definitions
        if ( c == '{' ) {
            size_t body_len = 0;
            char * body = parse_c_header_decl_body(&c, &body_len, f);

            // search the leading keywords for the first struct/union/enum
            int found_struct = 0;
            int found_union = 0;
            int found_enum = 0;
            int has_paren = 0;
            if ( slen > 0 ) {
                size_t word_len = 0;
                char *p = s, *prev_p = s;
                while ( (word_len = next_tok_len(&p, s + slen, ' ')) != 0 ) {
                    if ( hdr_word_is(prev_p, word_len, "struct") ) {
                        found_struct = 1;
                        break;
                    }
                    if ( hdr_word_is(prev_p, word_len, "union") ) {
                        found_union = 1;
                        break;
                    }
                    if ( hdr_word_is(prev_p, word_len, "enum") ) {
                        found_enum = 1;
                        break;
                    }
                    prev_p = p;
                }
                has_paren = (NULL != memchr(s, '(', slen));
            }

            // IF and ONLY IF we are defining a struct/union preserve the body
            if ( (found_struct || found_union) && !has_paren ) {
                hdr_buf_putc(&s, &slen, '{');
                // parse declarations in recursive instance; an empty body has
                // nothing to parse
                if ( body_len > 0 ) {
                    s = parse_c_header_sub(s, &slen, body, body_len);
                }
                hdr_buf_putc(&s, &slen, '}');
            }
            if ( found_enum ) {
                // in C all enums store values as 'int'
                static const char typestr[] = "int, ";
                size_t lwlen = 0;
                char * lastword;
                char * eq;

                // the body is searched with string functions below, so it
                // needs a terminator of its own
                body = hdr_buf_terminate(body, body_len);

                hdr_buf_putc(&s, &slen, '{');
                char * item = body;
                char * comma;
                while ( (comma = memchr(item, ',', body_len - (size_t)(item - body))) != NULL ) {
                    // cut the enumerator at its own '=' (if any), never at a
                    // later enumerator's
                    eq = memchr(item, '=', (size_t)(comma - item));
                    if ( eq != NULL ) *eq = 0;
                    else *comma = 0;
                    lastword = parse_c_last_word(&lwlen, item);
                    hdr_buf_append(&s, &slen, typestr, sizeof(typestr)-1);
                    hdr_buf_append(&s, &slen, lastword, lwlen);
                    hdr_buf_putc(&s, &slen, ';');
                    item = comma + 1;
                }
                // last enumerator (no trailing comma)
                eq = strchr(item, '=');
                if ( eq != NULL ) *eq = 0;
                lastword = parse_c_last_word(&lwlen, item);
                if ( lastword != NULL ) {
                    hdr_buf_append(&s, &slen, typestr, sizeof(typestr)-1);
                    hdr_buf_append(&s, &slen, lastword, lwlen);
                    hdr_buf_putc(&s, &slen, ';');
                }
                hdr_buf_putc(&s, &slen, '}');
            }
            free( body );
            if ( c == EOF ) break;
            continue;
        }

        // consume any static variable assignments
        if ( c == '=' ) {
            while ( (c = fgetc(f)) != EOF && c != ';' );
            if ( c == EOF ) break;
        }

        if ( c == '(' ) {
            size_t args_len = 0;
            char *args = parse_c_header_decl_args(&c, &args_len, f);

            // a list whose first non-space character is '*' is the declarator
            // of a function pointer, e.g. the "(*name)" in "int (*name)(int)"
            int is_func_ptr = 0;
            for (size_t n=0; n < args_len; n++) {
                if ( args[n] == '*' ) is_func_ptr = 1;
                if ( !isspace((unsigned char)args[n]) ) break;
            }
            if ( is_func_ptr ) {
                // skip ahead to the real argument list and re-read its '('
                while ( (c = fgetc(f)) != EOF && c != '(' );
                if ( c == EOF ) {
                    free( args );
                    break;
                }
                ungetc(c, f);
                // if we are not already in a func pointer decl, set the name
                if ( func_name == NULL ) {
                    size_t lwlen = 0;
                    args = hdr_buf_terminate(args, args_len);
                    char *lastword = parse_c_last_word(&lwlen, args);
                    if ( lastword ) {
                        func_name = malloc( lwlen + 1 );
                        if ( func_name == NULL ) hdr_die_oom();
                        memcpy(func_name, lastword, lwlen);
                        func_name[lwlen] = 0;
                    }
                }
                free( args );
                continue;
            }

            // remove any space between the function identifier and '('
            if ( slen > 0 && s[slen-1] == ' ' ) slen--;
            hdr_buf_putc(&s, &slen, '(');
            s = parse_c_header_sub(s, &slen, args, args_len);
            free( args );
            hdr_buf_putc(&s, &slen, ')');
            if ( func_name ) {
                hdr_buf_putc(&s, &slen, ' ');
                hdr_buf_append(&s, &slen, func_name, strlen(func_name));
                free( func_name );
                func_name = NULL;
            }
            // consume remainder of function prototype definition
            while ( (c = fgetc(f)) != EOF && c != ';' && c != '{' );
            if ( c == '{' ) {
                // function bodies are consumed here rather than by the
                // generic '{' branch above; c becomes '}' and ends the
                // declaration below
                size_t body_len = 0;
                char * body = parse_c_header_decl_body(&c, &body_len, f);
                free( body );
            }
            if ( c == EOF ) break;
        }

        // end of declaration
        if ( c == ';' || c == '}' ) {
            for (int64_t n=0; n < peak_array_count; n++) {
                hdr_buf_append(&s, &slen, "[]", 2);
            }
            s = hdr_buf_terminate(s, slen);

            char ** grown = realloc(decl, (*ndecl + 1) * sizeof *grown);
            if ( grown == NULL ) hdr_die_oom();
            decl = grown;
            decl[*ndecl] = s;
            (*ndecl)++;

            // start the next declaration from a clean state; the bracket
            // depth belongs to the declaration just stored
            s = NULL;
            slen = 0;
            peak_array_count = 0;
            continue;
        }

        if ( isspace(c) ) {
            // add space to the buffer but de-duplicate any repeats
            if ( slen != 0 && s[slen-1] != ' ' ) {
                hdr_buf_putc(&s, &slen, ' ');
            }
            continue;
        }
        hdr_buf_putc(&s, &slen, (char)c);
    }

    // an unterminated trailing declaration is discarded
    free( func_name );
    free( s );
    return decl;
}
/**
 * Usage: <program> <header>
 *
 * Runs the compiler named by the CC environment variable ("cc" when unset)
 * over the given header via open_c_header(), parses the result with
 * parse_c_header() and prints one declaration per line on stdout.
 *
 * Exit status is 0 on success and 1 when the argument is missing, the
 * preprocessor could not be started, or it reported failure.
 */
int main(int argc, char *argv[]) {
    if ( argc <= 1 ) {
        fprintf(stderr, "usage: %s <header>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "c-hdr-parser");
        return 1;
    }

    const char *cmd = getenv("CC");
    FILE *fhdr = open_c_header(((cmd == NULL) ? "cc" : cmd), argv[1]);
    if ( fhdr == NULL ) {
        perror(argv[1]);
        return 1;
    }

    size_t ndecl = 0;
    char **decl = parse_c_header(&ndecl, fhdr);
    /* reap the child; its status tells whether the preprocessor succeeded */
    int status = pclose(fhdr);

    for (size_t n = 0; n < ndecl; n++) {
        puts(decl[n]);
        free(decl[n]);
    }
    free(decl);

    if ( status != 0 ) {
        fprintf(stderr, "%s: preprocessor command failed\n", argv[1]);
        return 1;
    }
    return 0;
}