lex.ha (5693B)
use regex;
use strconv;
use fmt;
use strings;
use encoding::utf8;

// The default token constructor.
//
// The token starts at scan.start. Its end is the location of the last rune of
// the morphene (inclusive); for an empty morphene it is scan.start itself.
// The token is heap-allocated; allocation failure is propagated to the caller.
export fn default_token(
	scan: *scanner,
	name: const str,
	value: value,
	morphene: const str,
	lexeme: const str,
) (*token | error) = {
	// These three are mutated below, so they are declared with let.
	let end = scan.start;
	let last = end;
	let decoder = utf8::decode(strings::toutf8(morphene));
	for (let r => utf8::next(&decoder)!) {
		// Remember where the current rune sits before stepping past it.
		last = end;
		forwardr(&end, [r as rune]);
	};
	return alloc(token {
		name = name,
		value = value,
		morphene = morphene,
		lexeme = lexeme,
		start = scan.start,
		end = last,
		tostrfn = &tokstr,
		freefn = &tokfree,
	})?;
};

// Default string conversion for a token: the morphene when the token carries no
// value, otherwise a textual form of the value.
//
// NOTE(review): the strconv and utf8::encoderune results are documented by the
// standard library as statically allocated; duplicate the string if it must
// outlive the next conversion.
fn tokstr(tok: *token) str = {
	match (tok.value) {
	case void =>
		return tok.morphene;
	case let val: f64 =>
		return strconv::f64tos(val);
	case let val: i64 =>
		return strconv::i64tos(val);
	case let val: size =>
		return strconv::ztos(val);
	case let val: u64 =>
		return strconv::u64tos(val);
	case let val: str =>
		return val;
	case let val: rune =>
		return strings::fromutf8_unsafe(utf8::encoderune(val));
	};
};

// Default token destructor: releases the token allocation itself.
fn tokfree(tok: *token) void = {
	free(tok);
};

// Format a token as a string, using the token's own tostrfn.
export fn strtoken(tok: *token) str = tok.tostrfn(tok);

// Initialize a new [[lexer]] to lex the input string. The caller must free
// associated resources with [[finish]].
//
// When tokfn is null, [[default_token]] is used to construct tokens.
export fn init(
	be: *backend,
	in: const str = "",
	tokfn: nullable *tokenfn = null,
) lexer = {
	const origin = location {
		off = 0,
		line = 1,
		col = 1,
	};
	return lexer {
		be = be,
		in = in,
		token = match (tokfn) {
		case null =>
			yield &default_token;
		case let custom: *tokenfn =>
			yield custom;
		},
		un = null,
		loc = origin,
		prevunlocs = [(origin, origin)...],
		...
	};
};

// Convenient function to reuse an existing lexer with a new input string.
// Every token previously produced by the lexer is freed, so pointers obtained
// from earlier calls to [[next]] must not be used afterwards.
export fn reuse(lex: *lexer, in: const str) void = {
	const origin = location {
		off = 0,
		line = 1,
		col = 1,
	};
	lex.in = in;
	// Drop any pending unlexed token: its storage is released just below.
	lex.un = null;
	lex.loc = origin;
	// Clear the previous-rune location too, so that [[prevloc]] does not
	// report a position belonging to the old input. A zero location matches
	// what a freshly initialized lexer holds.
	lex.prevrloc = location {
		off = 0,
		line = 0,
		col = 0,
	};
	lex.prevunlocs = [(origin, origin)...];
	for (let tok .. lex.tokens) {
		tok.freefn(tok);
	};
	delete(lex.tokens[..]);
	match (lex.reuse) {
	case null => void;
	case let cb: *reusecb =>
		cb(lex);
	};
};

// Free resources associated with a [[lexer]]. Every token produced by the
// lexer is freed through its own freefn.
export fn finish(lex: *lexer) void = {
	for (let tok .. lex.tokens) {
		tok.freefn(tok);
	};
	free(lex.tokens);
	// Leave no dangling references behind, so that a later [[reuse]] or a
	// second [[finish]] does not touch freed memory.
	lex.tokens = [];
	lex.un = null;
};

// Build a [[syntax]] error from a location and a formatted message.
//
// The message is written into a static 2048-byte buffer: it is overwritten by
// the next call, and the `!` asserts that formatting succeeded.
export fn syntaxf(loc: location, fmt: const str, args: fmt::field...) syntax = {
	static let buf: [2048]u8 = [0...];
	const msg = fmt::bsprintf(buf, fmt, args...)!;
	return (loc, msg);
};

// Gives the current location of the lexer. While a token is unlexed, this is
// the current location recorded in the older slot of the location history.
export fn mkloc(lex: *lexer) location = {
	match (lex.un) {
	case null =>
		return lex.loc;
	case *token =>
		return lex.prevunlocs[1].1;
	};
};

// Gives the previous location of the lexer. While a token is unlexed, this is
// the previous location recorded in the older slot of the location history.
export fn prevloc(lex: *lexer) location = {
	match (lex.un) {
	case null =>
		return lex.prevrloc;
	case *token =>
		return lex.prevunlocs[1].0;
	};
};

// Initialize a token based on the scan context. When the lexeme (second) is
// empty, the morphene (first) is used for both. When the name is empty, the
// name of the action currently being scanned is used.
export fn scan_token(
	scan: *scanner,
	value: value,
	first: const str,
	second: const str = "",
	name: const str = ""
) (*token | error) = {
	const lexeme = if (second == "") first else second;
	const tokname = if (name == "") scan.name else name;
	return scan.lex.token(scan, tokname, value, first, lexeme);
};

// Return the lexer associated to a scanner.
export fn scan_lexer(scan: *scanner) *lexer = scan.lex;

// Return the token name associated to a scanner.
export fn scan_name(scan: *scanner) str = scan.name;

// Return a string representing the error.
//
// Syntax errors are formatted into a static 2048-byte buffer, so the returned
// string is overwritten by the next call that formats a syntax error.
export fn strerror(err: error) str = {
	static let buf: [2048]u8 = [0...];
	match (err) {
	case let s: syntax =>
		return fmt::bsprintf(buf, "{}:{}: syntax error: {}",
			s.0.line, s.0.col, s.1)!;
	case let com: compile =>
		return com;
	case nomem =>
		return "nomem";
	};
};

// Give the next token from the lexer.
//
// A token previously handed to [[unlex]] is returned first. Otherwise the
// backend is run on the remaining input; actions that return a string skip
// that text, and actions that return a token end the call. At end of input an
// EOF token is produced. Returned tokens are owned by the lexer and are freed
// by [[reuse]] or [[finish]].
export fn next(lex: *lexer) (*token | error) = {
	match (lex.un) {
	case null => void;
	case let pending: *token =>
		lex.un = null;
		return pending;
	};

	// On the way out, shift the location history: slot 1 takes the pair
	// recorded by the previous call, slot 0 the pair for this call.
	defer {
		lex.prevunlocs[1] = lex.prevunlocs[0];
		lex.prevunlocs[0] = (prevloc(lex), mkloc(lex));
	};

	let scan = scanner {
		lex = lex,
		start = lex.loc,
		...
	};

	for (true) {
		scan.in = strings::sub(lex.in, lex.loc.off, strings::end);
		if (len(scan.in) == 0) {
			const tok = scan.lex.token(&scan, EOF, void, "", "")?;
			// The lexer does not own the token until it is in the
			// list; free it here if the list cannot grow.
			match (append(lex.tokens, tok)) {
			case void => void;
			case nomem =>
				tok.freefn(tok);
				return nomem;
			};
			return tok;
		};

		const (act, matched) = match (perform(lex.be, scan.in)) {
		case void =>
			return syntaxf(mkloc(lex), "no matching token");
		case let this: (*action, str) =>
			yield this;
		};

		scan.name = act.name;

		match (act.cb(&scan, matched, act.user)?) {
		case let skipped: str =>
			// Nothing to emit: step over the text and rescan from there.
			forwardlex(lex, skipped);
			scan.start = lex.loc;
		case let tok: *token =>
			forwardlex(lex, tok.lexeme);
			// Same ownership rule as for the EOF token above.
			match (append(lex.tokens, tok)) {
			case void => void;
			case nomem =>
				tok.freefn(tok);
				return nomem;
			};
			return tok;
		};
	};
};

// Unlex a token, so that it is returned again by the next call to [[next]].
// Only one token may be unlexed at a time.
export fn unlex(lex: *lexer, value: *token) void = {
	assert(lex.un is null);
	lex.un = value;
};

// Advances the lexer location past every rune of the string, recording the
// location of the last rune stepped over in lex.prevrloc.
fn forwardlex(lex: *lexer, in: str) void = {
	let decoder = utf8::decode(strings::toutf8(in));
	for (let r => utf8::next(&decoder)!) {
		lex.prevrloc = lex.loc;
		forwardr(&lex.loc, [r as rune]);
	};
};

// Moves a location past every rune of the given string. The offset advances by
// one per rune (not per byte); a newline starts a new line at column 1.
export fn forward(loc: *location, in: str) void = {
	let decoder = utf8::decode(strings::toutf8(in));
	for (let r => utf8::next(&decoder)!) {
		forwardr(loc, [r as rune]);
	};
};

// Moves a location past each of the given runes.
fn forwardr(loc: *location, r: []rune) void = {
	for (let c .. r) {
		loc.off += 1;
		if (c == '\n') {
			loc.line += 1;
			loc.col = 1;
		} else {
			loc.col += 1;
		};
	};
};