From 5247342c38fce0f9161d8cd56835130263c3eb50 Mon Sep 17 00:00:00 2001 From: Mart Lubbers Date: Tue, 9 Feb 2021 11:24:50 +0100 Subject: [PATCH] make parser more robust, add string literals and escapes --- ast.c | 142 ++++++++++++++++++++++++------------------------------ ast.h | 16 ++++-- input.txt | 4 +- parse.y | 15 +++--- scan.l | 6 +-- todo.txt | 2 - util.c | 65 +++++++++++++++++++++++++ util.h | 12 +++-- 8 files changed, 162 insertions(+), 100 deletions(-) delete mode 100644 todo.txt diff --git a/ast.c b/ast.c index 0a17870..597dfa6 100644 --- a/ast.c +++ b/ast.c @@ -101,6 +101,14 @@ struct stmt *stmt_expr(struct expr *expr) return res; } +struct stmt *stmt_vardecl(struct vardecl *vardecl) +{ + struct stmt *res = safe_malloc(sizeof(struct stmt)); + res->type = svardecl; + res->data.svardecl = vardecl; + return res; +} + struct stmt *stmt_while(struct expr *pred, struct list *body) { struct stmt *res = safe_malloc(sizeof(struct stmt)); @@ -128,52 +136,44 @@ struct expr *expr_bool(bool b) res->data.ebool = b; return res; } -int fromHex(char c) -{ - if (c >= '0' && c <= '9') - return c-'0'; - if (c >= 'a' && c <= 'f') - return c-'a'+10; - if (c >= 'A' && c <= 'F') - return c-'A'+10; - return -1; -} -struct expr *expr_char(const char *c) +struct expr *expr_char(char *c) { struct expr *res = safe_malloc(sizeof(struct expr)); res->type = echar; - //regular char - if (c[0] == '\'' && c[2] == '\'') - res->data.echar = c[1]; - //escape - else if (c[0] == '\'' && c[1] == '\\' && c[3] == '\'') - switch(c[2]) { - case '0': res->data.echar = '\0'; break; - case '\'': res->data.echar = '\''; break; - case '\\': res->data.echar = '\\'; break; - case 'a': res->data.echar = '\a'; break; - case 'b': res->data.echar = '\b'; break; - case 't': res->data.echar = '\t'; break; - case 'v': res->data.echar = '\v'; break; - case 'f': res->data.echar = '\f'; break; - case 'r': res->data.echar = '\r'; break; - } - //hex escape - else if (c[0] == '\'' && c[1] == '\\' && c[2] == 'x' && 
c[5] == '\'') - res->data.echar = (fromHex(c[3])<<4)+fromHex(c[4]); - else - die("malformed character: %s\n", c); + res->data.echar = unescape_char(c)[0]; return res; } -struct expr *expr_funcall(char *ident, struct list *args) +static void set_fields(enum fieldspec **farray, int *n, struct list *fields) +{ + void **els = list_to_array(fields, n, true); + *farray = (enum fieldspec *)safe_malloc(*n*sizeof(enum fieldspec)); + for (int i = 0; i<*n; i++) { + char *t = els[i]; + if (strcmp(t, "fst") == 0) + (*farray)[i] = fst; + else if (strcmp(t, "snd") == 0) + (*farray)[i] = snd; + else if (strcmp(t, "hd") == 0) + (*farray)[i] = hd; + else if (strcmp(t, "tl") == 0) + (*farray)[i] = tl; + free(t); + } + free(els); +} + + +struct expr *expr_funcall(char *ident, struct list *args, struct list *fields) { struct expr *res = safe_malloc(sizeof(struct expr)); res->type = efuncall; res->data.efuncall.ident = ident; res->data.efuncall.args = (struct expr **) list_to_array(args, &res->data.efuncall.nargs, true); + set_fields(&res->data.efuncall.fields, + &res->data.efuncall.nfields, fields); return res; } @@ -190,23 +190,7 @@ struct expr *expr_ident(char *ident, struct list *fields) struct expr *res = safe_malloc(sizeof(struct expr)); res->type = eident; res->data.eident.ident = ident; - - void **els = list_to_array(fields, &res->data.eident.nfields, true); - res->data.eident.fields = (enum fieldspec *)safe_malloc( - res->data.eident.nfields*sizeof(enum fieldspec)); - for (int i = 0; i<res->data.eident.nfields; i++) { - char *t = els[i]; - if (strcmp(t, "fst") == 0) - res->data.eident.fields[i] = fst; - else if (strcmp(t, "snd") == 0) - res->data.eident.fields[i] = snd; - else if (strcmp(t, "hd") == 0) - res->data.eident.fields[i] = hd; - else if (strcmp(t, "tl") == 0) - res->data.eident.fields[i] = tl; - free(t); - } - free(els); + set_fields(&res->data.eident.fields, &res->data.eident.nfields, fields); return res; } @@ -230,9 +214,15 @@ struct expr *expr_string(char *str) { struct 
expr *res = safe_malloc(sizeof(struct expr)); res->type = estring; - res->data.estring = safe_strdup(str+1); - res->data.estring[strlen(res->data.estring)-1] = '\0'; - //TODO escapes + res->data.estring.nchar = 0; + res->data.estring.chars = safe_malloc(strlen(str)+1); + char *p = res->data.estring.chars; + while(*str != '\0') { + str = unescape_char(str); + *p++ = *str++; + res->data.estring.nchar++; + } + *p = '\0'; return res; } @@ -296,20 +286,6 @@ struct type *type_var(char *ident) return res; } - -const char *cescapes[] = { - [0] = "0", [1] = "x01", [2] = "x02", [3] = "x03", - [4] = "x04", [5] = "x05", [6] = "x06", [7] = "a", [8] = "b", - [9] = "t", [10] = "n", [11] = "v", [12] = "f", [13] = "r", - [14] = "x0E", [15] = "x0F", [16] = "x10", [17] = "x11", - [18] = "x12", [19] = "x13", [20] = "x14", [21] = "x15", - [22] = "x16", [23] = "x17", [24] = "x18", [25] = "x19", - [26] = "x1A", [27] = "x1B", [28] = "x1C", [29] = "x1D", - [30] = "x1E", [31] = "x1F", - ['\\'] = "\\", ['\''] = "'", - [127] = "x7F" -}; - void ast_print(struct ast *ast, FILE *out) { if (ast == NULL) @@ -408,6 +384,9 @@ void stmt_print(struct stmt *stmt, int indent, FILE *out) expr_print(stmt->data.sexpr, out); safe_fprintf(out, ";\n"); break; + case svardecl: + vardecl_print(stmt->data.svardecl, indent, out); + break; case swhile: pindent(indent, out); safe_fprintf(out, "while ("); @@ -428,6 +407,7 @@ void expr_print(struct expr *expr, FILE *out) { if (expr == NULL) return; + char buf[] = "\\xff"; switch(expr->type) { case ebinop: safe_fprintf(out, "("); @@ -440,16 +420,8 @@ void expr_print(struct expr *expr, FILE *out) safe_fprintf(out, "%s", expr->data.ebool ? 
"true" : "false"); break; case echar: - if (expr->data.echar < 0) { - safe_fprintf(out, "'?'"); - } else if (expr->data.echar < ' ' || expr->data.echar == 127 - || expr->data.echar == '\\' - || expr->data.echar == '\'') { - safe_fprintf(out, "'\\%s'", - cescapes[(int)expr->data.echar]); - } else { - safe_fprintf(out, "'%c'", expr->data.echar); - } + safe_fprintf(out, "'%s'", + escape_char(expr->data.echar, buf, false)); break; case efuncall: safe_fprintf(out, "%s(", expr->data.efuncall.ident); @@ -459,6 +431,9 @@ void expr_print(struct expr *expr, FILE *out) safe_fprintf(out, ", "); } safe_fprintf(out, ")"); + for (int i = 0; i<expr->data.efuncall.nfields; i++) + fprintf(out, ".%s", + fieldspec_str[expr->data.efuncall.fields[i]]); break; case eint: safe_fprintf(out, "%d", expr->data.eint); @@ -480,7 +455,11 @@ void expr_print(struct expr *expr, FILE *out) safe_fprintf(out, ")"); break; case estring: - safe_fprintf(out, "\"%s\"", expr->data.estring); + safe_fprintf(out, "\""); + for (int i = 0; i<expr->data.estring.nchar; i++) + safe_fprintf(out, "%s", escape_char( + expr->data.estring.chars[i], buf, true)); + safe_fprintf(out, "\""); break; case eunop: safe_fprintf(out, "(%s", unop_str[expr->data.eunop.op]); @@ -577,6 +556,7 @@ void stmt_free(struct stmt *stmt) free(stmt->data.sassign.ident); for (int i = 0; i<stmt->data.sassign.nfield; i++) free(stmt->data.sassign.fields[i]); + free(stmt->data.sassign.fields); expr_free(stmt->data.sassign.expr); break; case sif: @@ -600,6 +580,9 @@ void stmt_free(struct stmt *stmt) stmt_free(stmt->data.swhile.body[i]); free(stmt->data.swhile.body); break; + case svardecl: + vardecl_free(stmt->data.svardecl); + break; default: die("Unsupported stmt node\n"); } @@ -623,6 +606,7 @@ void expr_free(struct expr *expr) free(expr->data.efuncall.ident); for (int i = 0; i<expr->data.efuncall.nargs; i++) expr_free(expr->data.efuncall.args[i]); + free(expr->data.efuncall.fields); free(expr->data.efuncall.args); break; case eint: @@ -638,7 +622,7 @@ void expr_free(struct 
expr *expr) expr_free(expr->data.etuple.right); break; case estring: - free(expr->data.estring); + free(expr->data.estring.chars); break; case eunop: expr_free(expr->data.eunop.l); diff --git a/ast.h b/ast.h index 37ac2d5..0332e84 100644 --- a/ast.h +++ b/ast.h @@ -53,7 +53,7 @@ struct decl { }; struct stmt { - enum {sassign, sif, sreturn, sexpr, swhile} type; + enum {sassign, sif, sreturn, sexpr, svardecl, swhile} type; union { struct { char *ident; @@ -68,6 +68,7 @@ struct stmt { int nels; struct stmt **els; } sif; + struct vardecl *svardecl; struct expr *sreturn; struct expr *sexpr; struct { @@ -99,6 +100,8 @@ struct expr { char *ident; int nargs; struct expr **args; + int nfields; + enum fieldspec *fields; } efuncall; int eint; struct { @@ -110,7 +113,10 @@ struct expr { struct expr *left; struct expr *right; } etuple; - char *estring; + struct { + int nchar; + char *chars; + } estring; struct { enum unop op; struct expr *l; @@ -130,13 +136,13 @@ struct stmt *stmt_assign(char *ident, struct list *fields, struct expr *expr); struct stmt *stmt_if(struct expr *pred, struct list *then, struct list *els); struct stmt *stmt_return(struct expr *rtrn); struct stmt *stmt_expr(struct expr *expr); -struct stmt *stmt_vardecl(struct vardecl vardecl); +struct stmt *stmt_vardecl(struct vardecl *vardecl); struct stmt *stmt_while(struct expr *pred, struct list *body); struct expr *expr_binop(struct expr *l, enum binop op, struct expr *r); struct expr *expr_bool(bool b); -struct expr *expr_char(const char *c); -struct expr *expr_funcall(char *ident, struct list *args); +struct expr *expr_char(char *c); +struct expr *expr_funcall(char *ident, struct list *args, struct list *fields); struct expr *expr_int(int integer); struct expr *expr_ident(char *ident, struct list *fields); struct expr *expr_nil(); diff --git a/input.txt b/input.txt index a723fc8..c1579d6 100644 --- a/input.txt +++ b/input.txt @@ -13,7 +13,9 @@ fun(x){ '\x01'; '\xaa'; "abr"; - + "a\br"; + "a\br\""; + 
"a\xaar\\"; return 5; f(); f(x); f(1, 2, []); diff --git a/parse.y b/parse.y index dc85dac..ee3a55a 100644 --- a/parse.y +++ b/parse.y @@ -81,21 +81,23 @@ funtype : /* empty */ { $$ = NULL; } | funtype ftype { $$ = list_cons($2, $1); } ; -ftype +/* don't allow vardecls to be fully polymorph, this complicates parsing a lot */ +type : BOPEN ftype COMMA ftype BCLOSE { $$ = type_tuple($2, $4); } | SOPEN ftype SCLOSE { $$ = type_list($2); } | TBOOL { $$ = type_basic(btbool); } | TCHAR { $$ = type_basic(btchar); } | TINT { $$ = type_basic(btint); } | TVOID { $$ = type_basic(btvoid); } - | IDENT { $$ = type_var($1); } -type - : BOPEN type COMMA type BCLOSE { $$ = type_tuple($2, $4); } - | SOPEN type SCLOSE { $$ = type_list($2); } + ; +ftype + : BOPEN ftype COMMA ftype BCLOSE { $$ = type_tuple($2, $4); } + | SOPEN ftype SCLOSE { $$ = type_list($2); } | TBOOL { $$ = type_basic(btbool); } | TCHAR { $$ = type_basic(btchar); } | TINT { $$ = type_basic(btint); } | TVOID { $$ = type_basic(btvoid); } + | IDENT { $$ = type_var($1); } ; args : /* empty */ { $$ = NULL; } @@ -132,6 +134,7 @@ stmt | IDENT field ASSIGN expr SEMICOLON { $$ = stmt_assign($1, $2, $4); } | RETURN expr SEMICOLON { $$ = stmt_return($2); } | RETURN SEMICOLON { $$ = stmt_return(NULL); } + | vardecl { $$ = stmt_vardecl($1); } | expr SEMICOLON { $$ = stmt_expr($1); } ; expr @@ -152,7 +155,7 @@ expr | expr POWER expr { $$ = expr_binop($1, power, $3); } | MINUS expr %prec TIMES { $$ = expr_unop(negate, $2); } | INVERSE expr %prec TIMES { $$ = expr_unop(inverse, $2); } - | IDENT BOPEN fargs BCLOSE { $$ = expr_funcall($1, $3); } + | IDENT BOPEN fargs BCLOSE field { $$ = expr_funcall($1, $3, $5); } | BOPEN expr COMMA expr BCLOSE { $$ = expr_tuple($2, $4); } | BOPEN expr BCLOSE { $$ = $2; } | INTEGER diff --git a/scan.l b/scan.l index 59dec5f..9337331 100644 --- a/scan.l +++ b/scan.l @@ -68,10 +68,10 @@ Void return TVOID; \[\] return NIL; \. 
return DOT; , return COMMA; -\"([^"]|\\\")*\" { - yylval.expr = expr_string(yytext); return STRING; } +\"([^\\"]|\\[\"0\\abtnvfr]|\\x[0-9a-fA-F][0-9a-fA-F])*\" { + yylval.expr = expr_string(trimquotes(yytext)); return STRING; } '([^\\']|\\['0\\abtnvfr]|\\x[0-9a-fA-F][0-9a-fA-F])' { - yylval.expr = expr_char(yytext); return CHAR; } + yylval.expr = expr_char(trimquotes(yytext)); return CHAR; } [0-9]+ { yylval.expr = expr_int(atoi(yytext)); return INTEGER; } [_a-zA-Z][_a-zA-Z0-9]* { diff --git a/todo.txt b/todo.txt deleted file mode 100644 index edf9d6f..0000000 --- a/todo.txt +++ /dev/null @@ -1,2 +0,0 @@ -parser: -- types diff --git a/util.c b/util.c index 4b77f53..91166fd 100644 --- a/util.c +++ b/util.c @@ -50,6 +50,71 @@ int list_length(struct list *r) return i; } +int fromHex(char c) +{ + if (c >= '0' && c <= '9') + return c-'0'; + if (c >= 'a' && c <= 'f') + return c-'a'+10; + if (c >= 'A' && c <= 'F') + return c-'A'+10; + return -1; +} + +char *escape_char(char c, char *buf, bool str) +{ + buf = buf == NULL ? safe_malloc(10) : buf; + switch (c) { + case '\0': strcpy(buf, "\\0"); break; + case '\a': strcpy(buf, "\\a"); break; + case '\b': strcpy(buf, "\\b"); break; + case '\t': strcpy(buf, "\\t"); break; + case '\n': strcpy(buf, "\\n"); break; + case '\v': strcpy(buf, "\\v"); break; + case '\f': strcpy(buf, "\\f"); break; + case '\r': strcpy(buf, "\\r"); break; + case '\'': strcpy(buf, str ? "'" : "\\'"); break; + case '"': strcpy(buf, str ? 
"\\\"" : "\""); break; + default: + if (c >= ' ' && c < 127) { + sprintf(buf, "%c", c); + } else { + sprintf(buf, "\\x%02x", (unsigned char)c); + } + break; + } + return buf; +} + +char *unescape_char(char *c) +{ + //escape + if (c[0] == '\\') { + switch (c[1]) { + case '0': c[1] = '\0'; break; + case '\'': c[1] = '\''; break; + case '\\': c[1] = '\\'; break; + case '"': c[1] = '"'; break; + case 'a': c[1] = '\a'; break; + case 'b': c[1] = '\b'; break; + case 't': c[1] = '\t'; break; + case 'v': c[1] = '\v'; break; + case 'f': c[1] = '\f'; break; + case 'r': c[1] = '\r'; break; + case 'x': c[3] = (fromHex(c[2])<<4)+fromHex(c[3]); c+=2; break; + } + c++; + } + return c; +} + +char *trimquotes(char *c) +{ + char *r = c+1; + r[strlen(r)-1] = '\0'; + return r; +} + void pdie(const char *msg) { perror(msg); diff --git a/util.h b/util.h index d8b40ba..79d12d8 100644 --- a/util.h +++ b/util.h @@ -4,10 +4,7 @@ #include #include -struct list { - void *el; - struct list *tail; -}; +struct list { void *el; struct list *tail; }; struct list *list_cons(void *el, struct list *tail); void list_free(struct list *head, void (*freefun)(void *)); void **list_to_array(struct list *list, int *num, bool reverse); @@ -16,7 +13,14 @@ int list_length(struct list *head); void die(const char *msg, ...); void pdie(const char *msg); +/* if buf == NULL, a fresh buffer is allocated */ +char *escape_char(char c, char *buf, bool str); +/* unescaped character will be in position 0 and the rest from position 1 on */ +char *unescape_char(char *c); +/* Remove the last and first character from the string */ +char *trimquotes(char *c); void pindent(int indent, FILE *out); + void safe_fprintf(FILE *out, const char *msg, ...); void *safe_malloc(size_t size); void *safe_strdup(const char *c); -- 2.20.1