lex.ha (3392B)
use io;
use ascii;
use fmt;
use strings;

// A half-open byte range (start, end) into the lexer's input buffer.
export type range = (size, size);

// The bytes covered by the range could not be recognised as any token.
export type unknowntoken = !range;
// Any other lexing failure; carries the offending byte range.
export type other = !range;
// All errors that [[next]] may return.
export type error = !(unknowntoken | other);

// Predicate over a single character, used by [[readblock]].
type asciifn = fn(r: rune) bool;

// Token kinds. Single-character tokens use the character itself as their
// value so that an input byte can be cast directly to its token type.
export type ttype = enum {
	EOF = -1,
	OBRACE = '(',
	CBRACE = ')',
	OCBRACE = '{',
	CCBRACE = '}',
	OSBRACE = '[',
	CSBRACE = ']',
	ASSIGN = '=',
	SEMI = ';',

	ADD = '+',
	SUB = '-',
	MUL= '*',
	DIV = '/',
	MOD = '%',

	LT = '<',
	GT = '>',
	NOT = '!',

	// The remaining members have no character of their own. They start at
	// an explicit value above the byte range: left implicit they would
	// continue from '!' (33) and collide with the character-valued members
	// (OR == '%', IF == '(', NAME == ')', NUMBER == '*').
	EQU = 256, // ==
	LTE, // <=
	GTE, // >=
	OR, // ||
	AND, // &&

	FUNC, // func
	IF, // if

	NAME,
	NUMBER,
};

// Lexer state.
export type lexer = struct {
	// Input being tokenised.
	in: []u8,
	// Offset in "in" of the next unread byte.
	pos: size,
	// Start offsets of the tokens returned so far, most recent last.
	// Grown by [[next]], shrunk by [[prev]], released by [[finish]].
	prev: []size
};

export type token = struct {
	ty: ttype,
	// range of data
	// "func main() i32 ..."
	//  ^    ^   ^^ ^^^
	data: range,
};

// Converts an error into a human-readable message.
//
// For [[unknowntoken]] the message is heap-allocated by fmt::asprintf and the
// caller must free it; for every other error a static string is returned.
// NOTE(review): the two branches have different ownership, so a caller cannot
// free the result unconditionally - confirm how callers handle this.
export fn strerror(e: error, l: *lexer) str = match (e) {
case let e: unknowntoken =>
	yield fmt::asprintf("Unknown token \"{}\"",
		strings::fromutf8(l.in[e.0 .. e.1])!)!;
case =>
	yield "unknown error";
};

// Frees the token history held by the lexer. The input buffer is not freed.
export fn finish(l: *lexer) void = {
	free(l.prev);
};

// Advances the lexer while "pred" accepts the current byte and returns the
// range that was consumed. Stops at the end of the input.
fn readblock(l: *lexer, pred: *asciifn) range = {
	const start = l.pos;
	// The bound check must come first: a block may run up to the very end
	// of the input.
	for (l.pos < len(l.in) && pred(l.in[l.pos]: rune); l.pos += 1)
		continue;
	return (start, l.pos);
};

// NOTE(review): '-' is accepted anywhere inside a number (and therefore inside
// a name), so "5-3" is read as a single NUMBER - confirm this is intended.
fn isnumber(r: rune) bool = ascii::isdigit(r) || (r == '-');
fn isname(r: rune) bool = ascii::isalpha(r) || isnumber(r);
fn iswhitespace(r: rune) bool = ascii::isblank(r) || (r == '\n');

// Tries to consume the literal "s" at the current position. On success the
// lexer is advanced past it and the matched range is returned; otherwise the
// position is left unchanged and void is returned.
fn lexstr(l: *lexer, s: str) (range | void) = {
	const start = l.pos;

	for (let c .. strings::toutf8(s)) {
		// Running out of input in the middle of "s" is a mismatch,
		// not an out-of-bounds read.
		if (l.pos >= len(l.in) || l.in[l.pos] != c) {
			l.pos = start;
			return;
		};
		l.pos += 1;
	};

	return (start, l.pos);
};

// Rewinds the lexer to the start of the most recently returned token and
// drops that token from the history. Aborts if there is nothing to rewind.
export fn prev(l: *lexer) void = {
	assert(len(l.prev) > 0, "prev: no token to rewind to");
	const last = len(l.prev) - 1;
	l.pos = l.prev[last];
	l.prev = l.prev[0 .. last];
};

// Returns the next token, skipping leading whitespace. At the end of the
// input an EOF token with an empty range is returned; EOF is not recorded in
// the history used by [[prev]]. A byte that starts no known token is skipped
// and reported as [[unknowntoken]].
export fn next(l: *lexer) (token | error) = {
	for (l.pos < len(l.in) && iswhitespace(l.in[l.pos]: rune))
		l.pos += 1;

	if (l.pos >= len(l.in))
		return token{ty = ttype::EOF, data = (l.pos, l.pos)};

	append(l.prev, l.pos)!;

	// Two-character operators are tried before the single-character
	// switch below so that "==" is not read as two ASSIGN tokens.
	const operators: [_](str, ttype) = [
		("==", ttype::EQU),
		("<=", ttype::LTE),
		(">=", ttype::GTE),
		("||", ttype::OR),
		("&&", ttype::AND),
	];
	for (let op .. operators) {
		match (lexstr(l, op.0)) {
		case let data: range =>
			return token{ty = op.1, data = data};
		case => yield;
		};
	};

	const keywords: [_](str, ttype) = [
		("func", ttype::FUNC),
		("if", ttype::IF),
	];
	for (let kw .. keywords) {
		match (lexstr(l, kw.0)) {
		case let data: range =>
			// A keyword only counts when it is not the prefix of a
			// longer identifier ("iffy", "function").
			const bounded = l.pos >= len(l.in)
				|| !(ascii::isalpha(l.in[l.pos]: rune)
					|| ascii::isdigit(l.in[l.pos]: rune));
			if (bounded)
				return token{ty = kw.1, data = data};
			l.pos = data.0;
		case => yield;
		};
	};

	// Single-character tokens: the byte value is the token type.
	const punct = l.in[l.pos]: ttype;
	switch (punct) {
	case ttype::OBRACE, ttype::CBRACE, ttype::OCBRACE, ttype::CCBRACE,
		ttype::OSBRACE, ttype::CSBRACE, ttype::ASSIGN,
		ttype::SEMI, ttype::ADD, ttype::SUB, ttype::MUL,
		ttype::DIV, ttype::MOD, ttype::LT, ttype::GT,
		ttype::NOT =>
		const at = l.pos;
		l.pos += 1;
		return token{ty = punct, data = (at, at + 1)};
	case => yield;
	};

	const first = l.in[l.pos]: rune;
	if (ascii::isalpha(first))
		return token{ty = ttype::NAME, data = readblock(l, &isname)};
	if (isnumber(first))
		return token{ty = ttype::NUMBER, data = readblock(l, &isnumber)};

	// Skip the offending byte so that the caller can keep lexing.
	const at = l.pos;
	l.pos += 1;
	return (at, at + 1): unknowntoken;
};