commit 9cca5ce74afac3b0967f04dbff6acdfa07d3af4b
parent 9f14011d13c1e9c2b20af43e06e9ece090bd3222
Author: thing1 <l.standen@posteo.com>
Date: Sun, 12 Oct 2025 14:17:09 +0100
added the ability to lex numbers and have more advanced erorrs
Diffstat:
| M | Makefile | | | 11 | ++++++----- |
| M | lex.c | | | 139 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
| M | lex.h | | | 8 | ++++++-- |
| A | lex_names.c | | | 15 | +++++++++++++++ |
| A | lex_names.h | | | 1 | + |
| M | main.c | | | 67 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ |
| M | test.hlc | | | 3 | ++- |
7 files changed, 194 insertions(+), 50 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,14 +1,14 @@
CC=c89
-CFLAGS=-ggdb -pedantic -fsanitize=address
-
-SRC = lex.c main.c
+CFLAGS=-ggdb -pedantic -fsanitize=address -Wall -Wextra -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200112L
+#CFLAGS=-ggdb -pedantic -Wall -Wextra
+SRC = lex.c lex_names.c main.c
OBJ = ${SRC:.c=.o}
-all: spec hlc
+.POSIX:
+all: hlc
spec: spec.md
smu spec.md > spec.html
-
.c.o:
${CC} -c ${CFLAGS} $<
@@ -17,3 +17,4 @@ hlc: ${OBJ}
clean:
rm -rf *.html *.o hlc
+
diff --git a/lex.c b/lex.c
@@ -3,6 +3,7 @@
#include <string.h>
#include <ctype.h>
+
#include "lex.h"
#define strnul(s) (s + strlen(s))
@@ -10,36 +11,50 @@
lex_val lex_error = {UNKNOWN, NULL};
lex_val lv = { 0 };
-FILE *input;
+char *input, *startpos, *endpos;
+size_t input_len = 0;
lex_val *(*nextfn)(void);
+int skipped = 0;
+
+void
+drainfile(FILE *in) {
+ input = malloc(input_len + 1);
+ fread(input, 1, input_len, in);
+ input[input_len] = 0;
+}
int
-getchr(FILE *in) {
+getchr(void) {
int c;
- while ((c = getc(in)) != EOF && (isblank(c) || c == '\n'))
- continue;
- if (c == EOF)
+ skipped = 0;
+ while ((c = *endpos) != 0 && (isblank(c) || c == '\n')) {
+ skipped++;
+ endpos++;
+ }
+ if (c == 0)
lex_error.type = EOI;
+
+ endpos++;
return c;
}
char *
check_bytes(char *str) {
char *s;
- for (s = str; *s; s++) if (getchr(input) != *s) return NULL;
+ for (s = str; *s; s++) if (getchr() != *s) return NULL;
return str;
}
int
-peekc(FILE *in) {
- int c = getchr(in);
- ungetc(c, in);
+peekc(void) {
+ int c = getchr();
+ endpos--;
return c;
}
lex_val *
lex_char(char c, enum lex_type t) {
- if (getchr(input) != c) return &lex_error;
+ if (getchr() != c) return &lex_error;
lv.type = t;
lv.data = NULL;
return &lv;
@@ -47,7 +62,7 @@ lex_char(char c, enum lex_type t) {
lex_val *
lex_type(void) {
- switch (peekc(input)) {
+ switch (peekc()) {
case 'b':
if (!(lv.data = check_bytes("byte"))) return &lex_error;
lv.type = BYTE;
@@ -66,7 +81,7 @@ lex_type(void) {
break;
default: return &lex_error;
}
- if (peekc(input) == '*') nextfn = &lex_type;
+ if (peekc() == '*') nextfn = &lex_type;
else nextfn = &lex_name;
return &lv;
}
@@ -74,33 +89,34 @@ lex_type(void) {
lex_val *
-lex_comma() {
+lex_comma(void) {
return lex_char(',', COMMA);
}
lex_val *
-lex_semi() {
+lex_semi(void) {
+ nextfn = &lex_type;
return lex_char(';', SEMI);
}
lex_val *
-lex_assign() {
+lex_assign(void) {
nextfn = &lex_value;
return lex_char('=', ASSIGN);
}
lex_val *
-lex_sym() {
- switch (getchr(input)) {
+lex_sym(void) {
+ switch (peekc()) {
case '<':
- switch (peekc(input)) {
- case '=': getchr(input); lv.type = LTE; break;
+ switch (peekc()) {
+ case '=': getchr(); lv.type = LTE; break;
default: lv.type = LT; break;
}
return &lv;
case '>':
- switch (peekc(input)) {
- case '=': getchr(input); lv.type = GTE; break;
+ switch (peekc()) {
+ case '=': getchr(); lv.type = GTE; break;
default: lv.type = GT; break;
}
return &lv;
@@ -108,18 +124,49 @@ lex_sym() {
if (!check_bytes("=")) return &lex_error;
lv.type = NEQ;
break;
+ case '=': return lex_assign();
+ case ',': return lex_comma();
+ case ';': return lex_semi();
+
case '+': lv.type = LT; break;
case '-': lv.type = LT; break;
case '/': lv.type = LT; break;
case '*': lv.type = LT; break;
case '&': lv.type = LT; break;
}
+
+ return &lv;
+}
+
+/* TODO read negative values */
+lex_val *
+lex_number(void) {
+ static char num[16];
+ char i, c;
+ for (i = 0, c = getchr(); i < 16 && isdigit(c); i++, c = getchr())
+ num[(int)i] = c;
+ endpos--;
+
+ lv.type = NUM;
+ lv.data = num;
+
+ nextfn = &lex_sym;
+
+ return &lv;
}
/* TODO make this read ints and other litterals */
lex_val *
-lex_value() {
- return lex_name();
+lex_value(void) {
+ char c = peekc();
+ if (isdigit(c)) /* number */
+ return lex_number();
+ switch (c) {
+ case '\"': /* strlit */
+
+ default: /* name */
+ return lex_name();
+ }
}
lex_val *
@@ -130,26 +177,25 @@ lex_name(void) {
memset(name, 0, 32); /* note this resets the previous value,
when converting to an ast, use a dup */
- if (peekc(input) == '*') {
- getchr(input);
+ if (peekc() == '*') {
+ getchr();
lv.type = DEREF;
return &lv;
}
- if (!isalpha((c = getchr(input)))) return &lex_error;
+ if (!isalpha((c = getchr()))) return &lex_error;
do {
memcpy(strnul(name), &c, 1);
if ((len++ + 1) == 32) return &lex_error;
- } while(isalnum((c = getc(input))));
- ungetc(c, input);
+ } while(isalnum((c = getchr())) && skipped == 0);
+ endpos--;
lv.data = name;
lv.type = NAME;
- switch (peekc(input)) {
- case '=': nextfn = &lex_assign; break; /* CHECK FOR == and = */
- case ',': nextfn = &lex_comma; break; /* leads to lex_type */
- case ';': nextfn = &lex_semi; break;
-
+ switch (peekc()) {
+ case '=':
+ case ',':
+ case ';':
case '<':
case '>':
case '!':
@@ -168,12 +214,33 @@ lex_name(void) {
void
init_lexer(FILE *in) {
- input = in;
+ fseek(in, 0, SEEK_END);
+ input_len = ftell(in);
+ rewind(in);
+
+ drainfile(in);
+ startpos = input;
+ endpos = startpos;
+
nextfn = &lex_type;
}
+int
+get_err_len(void) {
+ return endpos - startpos;
+}
+
+int get_line_num(char *s) {
+ int count = 1;
+ size_t i;
+ for (i = 0; i < input_len && &input[i] != s; i++)
+ if (input[i] == '\n') count++;
+ return count;
+}
+
lex_val *
-get_next() {
- memset(&lv, 0, sizeof(typeof(lv)));
+get_next(void) {
+ memset(&lv, 0, sizeof(lv));
+ startpos = endpos;
return (nextfn) ? nextfn() : NULL;
}
diff --git a/lex.h b/lex.h
@@ -1,4 +1,3 @@
-
enum lex_type {
UNKNOWN,
@@ -38,6 +37,9 @@ typedef struct lex_val {
char *data;
} lex_val;
+extern char *input, *startpos, *endpos;
+extern lex_val *(*nextfn)(void);
+
lex_val *lex_name(void);
lex_val *lex_value(void);
lex_val *lex_sym(void);
@@ -45,5 +47,7 @@ lex_val *lex_comma(void);
lex_val *lex_semi(void);
lex_val *lex_type(void);
+int get_err_len(void);
+int get_line_num(char *);
lex_val *get_next(void);
-void init_lexer(FILE *f);
+void init_lexer(FILE *);
diff --git a/lex_names.c b/lex_names.c
@@ -0,0 +1,15 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "lex.h"
+
+#define is(f) if (func == f)
+
+char *get_name(lex_val *(*func)(void)) {
+ is(lex_name) return "name";
+ is(lex_type) return "type";
+ is(lex_value) return "value";
+ is(lex_semi) return ";";
+ is(lex_sym) return "symbol";
+ is(lex_comma) return "comma";
+ return NULL;
+}
diff --git a/lex_names.h b/lex_names.h
@@ -0,0 +1 @@
+char *get_name(lex_val *(*func)(void));
diff --git a/main.c b/main.c
@@ -1,19 +1,74 @@
#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stddef.h>
#include "lex.h"
+#include "lex_names.h"
+
+FILE *in;
+
+void
+cleanup() {
+ free(input);
+ fclose(in);
+}
+
+void
+errorf(char *fmt, ...) {
+ va_list ap;
+ char str[256];
+ va_start(ap, fmt);
+ vsnprintf(str, 255, fmt, ap);
+ va_end(ap);
+ fprintf(stderr, "%s\n", str);
+}
+
+void
+syntax_error() {
+ char *loc = startpos - 1, *line = loc, *linestart;
+ int i, off;
+
+ errorf("Syntax error: Expected %s, got %.*s", get_name(nextfn), endpos - startpos, loc);
+
+ while (line != input && *line != '\n') line--;
+ if (*line == '\n') line++;
+ linestart = line;
+
+ off = fprintf(stderr, "%d: ", get_line_num(loc));
+ while (*line && *line != '\n')
+ putc(*line++, stderr);
+
+ putc('\n', stderr);
+
+ while (linestart - off != loc) {
+ putc(' ', stderr);
+ linestart++;
+ }
+
+ for (i = 0; i < get_err_len(); i++)
+ putc('^', stderr);
+ putc('\n', stderr);
+
+}
int
main() {
lex_val *val;
- FILE *f = fopen("test.hlc", "r");
- init_lexer(f);
- while (val = get_next()) {
+ in = fopen("test.hlc", "r");
+ init_lexer(in);
+
+ while ((val = get_next())) {
if (val->type == EOI)
break;
- if (val->type == UNKNOWN)
- fprintf(stderr, "unknown syntax\n");
+ if (val->type == UNKNOWN) {
+ syntax_error();
+ cleanup();
+ return 1;
+ }
}
- fclose(f);
+ cleanup();
+ return 0;
}
diff --git a/test.hlc b/test.hlc
@@ -1 +1,2 @@
-byte *name = *first;
+short age = 21;
+