lex.ha - sys - A set of unix utils in hare!

lex.ha (5693B)
      1 use regex;
      2 use strconv;
      3 use fmt;
      4 use strings;
      5 use encoding::utf8;
      6 
      7 // The default token constructor.
      8 export fn default_token(
      9 	scan: *scanner,
     10 	name: const str,
     11 	value: value,
     12 	morphene: const str,
     13 	lexeme: const str,
     14 ) (*token | error) = {
     15 	const end = scan.start;
     16 	const last = end;
     17 	const decoder = utf8::decode(strings::toutf8(morphene));
     18 	for (let r => utf8::next(&decoder)!) {
     19 		last = end;
     20 		forwardr(&end, [r as rune]);
     21 	};
     22 	return alloc(token {
     23 		name = name,
     24 		value = value,
     25 		morphene = morphene,
     26 		lexeme = lexeme,
     27 		start = scan.start,
     28 		end = last,
     29 		tostrfn = &tokstr,
     30 		freefn = &tokfree,
     31 	})?;
     32 };
     33 
     34 fn tokstr(tok: *token) str = {
     35 	match (tok.value) {
     36 	case void => return tok.morphene;
     37 	case let val: f64 => return strconv::f64tos(val);
     38 	case let val: i64 => return strconv::i64tos(val);
     39 	case let val: size => return strconv::ztos(val);
     40 	case let val: u64 => return strconv::u64tos(val);
     41 	case let val: str => return val;
     42 	case let val: rune => return strings::fromutf8_unsafe(utf8::encoderune(val));
     43 	};
     44 };
     45 
     46 fn tokfree(tok: *token) void = {
     47 	free(tok);
     48 };
     49 
     50 // Format a token as a string.
     51 export fn strtoken(tok: *token) str = tok.tostrfn(tok);
     52 
     53 // Initialize a new [[lexer]] to lex the input bytes. The caller must free
     54 // associated resources with [[finish]].
     55 export fn init(
     56 	be: *backend,
     57 	in: const str = "",
     58 	tokfn: nullable *tokenfn = null,
     59 ) lexer = {
     60 	const loc = location {
     61 		off = 0,
     62 		line = 1,
     63 		col = 1,
     64 	};
     65 	return lexer {
     66 		be = be,
     67 		in = in,
     68 		token = if (tokfn is *tokenfn) tokfn as *tokenfn else &default_token,
     69 		un = null,
     70 		loc = loc,
     71 		prevunlocs = [(loc, loc)...],
     72 		...
     73 	};
     74 };
     75 
     76 // Convenient function to reuse an existing lexer with a new input string.
     77 export fn reuse(lex: *lexer, in: const str) void = {
     78 	const loc = location {
     79 		off = 0,
     80 		line = 1,
     81 		col = 1,
     82 	};
     83 	lex.in = in;
     84 	lex.un = null;
     85 	lex.loc = loc;
     86 	lex.prevunlocs = [(loc, loc)...];
     87 	for (let tok .. lex.tokens)
     88 		tok.freefn(tok);
     89 	delete(lex.tokens[..]);
     90 	match (lex.reuse) {
     91 	case null => void;
     92 	case let reuse: *reusecb => reuse(lex);
     93 	};
     94 };
     95 
     96 // Free resources associated with a [[lexer]].
     97 export fn finish(lex: *lexer) void = {
     98 	for (let tok .. lex.tokens)
     99 		tok.freefn(tok);
    100 	free(lex.tokens);
    101 };
    102 
    103 // Format a [[syntax]] error as a formatable.
    104 export fn syntaxf(loc: location, fmt: const str, args: fmt::field...) syntax = {
    105 	static let buf: [2048]u8 = [0...];
    106 	const msg = fmt::bsprintf(buf, fmt, args...)!;
    107 	return (loc, msg);
    108 };
    109 
    110 // Gives the current location of the lexer.
    111 export fn mkloc(lex: *lexer) location = {
    112 	match (lex.un) {
    113 	case null => return lex.loc;
    114 	case let tok: *token => return lex.prevunlocs[1].1;
    115 	};
    116 };
    117 
    118 // Gives the previous location of the lexer.
    119 export fn prevloc(lex: *lexer) location = {
    120 	match (lex.un) {
    121 	case null => return lex.prevrloc;
    122 	case let tok: *token => return lex.prevunlocs[1].0;
    123 	};
    124 };
    125 
    126 // Initialize a token based on the scan context. When the lexeme is not present,
    127 // this considers the morphene as both. When the name is not present, the name
    128 // comes from the action will be use.
    129 export fn scan_token(
    130 	scan: *scanner,
    131 	value: value,
    132 	first: const str,
    133 	second: const str = "",
    134 	name: const str = ""
    135 ) (*token | error) = {
    136 	const (morphene, lexeme) = if (second == "") {
    137 		yield (first, first);
    138 	} else {
    139 		yield (first, second);
    140 	};
    141 	return scan.lex.token(
    142 		scan,
    143 		if (name == "") scan.name else name,
    144 		value,
    145 		morphene,
    146 		lexeme
    147 	);
    148 };
    149 
    150 // Return the lexer associated to a scanner.
    151 export fn scan_lexer(scan: *scanner) *lexer = scan.lex;
    152 
    153 // Return the token name associated to a scanner.
    154 export fn scan_name(scan: *scanner) str = scan.name;
    155 
    156 // Return a string representing the error.
    157 export fn strerror(err: error) str = {
    158 	static let buf: [2048]u8 = [0...];
    159 	match (err) {
    160 	case let s: syntax =>
    161 		return fmt::bsprintf(buf, "{}:{}: syntax error: {}",
    162 			s.0.line, s.0.col, s.1)!;
    163 	case let com: compile => return com;
    164 	case nomem => return "nomem";
    165 	};
    166 };
    167 
    168 // Give the next token from the lexer.
    169 export fn next(lex: *lexer) (*token | error) = {
    170 	if (lex.un is *token) {
    171 		const prev = lex.un as *token;
    172 		lex.un = null;
    173 		return prev;
    174 	};
    175 
    176 	defer {
    177 		lex.prevunlocs[1] = lex.prevunlocs[0];
    178 		lex.prevunlocs[0] = (prevloc(lex), mkloc(lex));
    179 	};
    180 
    181 
    182 	let scan = scanner {
    183 		lex = lex,
    184 		start = lex.loc,
    185 		...
    186 	};
    187 
    188 	for (true) {
    189 		scan.in = strings::sub(lex.in, lex.loc.off, strings::end);
    190 		if (len(scan.in) == 0) {
    191 			const tok = scan.lex.token(&scan, EOF, void, "", "")?;
    192 			append(lex.tokens, tok)?;
    193 			return tok;
    194 		};
    195 
    196 		const (action, lexeme) = match (perform(lex.be, scan.in)) {
    197 		case void => return syntaxf(mkloc(lex), "no matching token");
    198 		case let this: (*action, str) => yield this;
    199 		};
    200 
    201 		scan.name = action.name;
    202 
    203 		match (action.cb(&scan, lexeme, action.user)?) {
    204 		case let lexeme: str =>
    205 			forwardlex(lex, lexeme);
    206 			scan.start = lex.loc;
    207 		case let tok: *token =>
    208 			forwardlex(lex, tok.lexeme);
    209 			append(lex.tokens, tok)?;
    210 			return tok;
    211 		};
    212 	};
    213 };
    214 
    215 // Unlex a token, so that it get lexed back with [[next]].
    216 export fn unlex(lex: *lexer, value: *token) void = {
    217 	assert(lex.un is null);
    218 	lex.un = value;
    219 };
    220 
    221 fn forwardlex(lex: *lexer, in: str) void = {
    222 	const decoder = utf8::decode(strings::toutf8(in));
    223 	for (let r => utf8::next(&decoder)!) {
    224 		lex.prevrloc = lex.loc;
    225 		forwardr(&lex.loc, [r as rune]);
    226 	};
    227 };
    228 
    229 // Moves a location based on bytes.
    230 export fn forward(loc: *location, in: str) void = {
    231 	const decoder = utf8::decode(strings::toutf8(in));
    232 	for (let r => utf8::next(&decoder)!) {
    233 		forwardr(loc, [r as rune]);
    234 	};
    235 };
    236 
    237 fn forwardr(loc: *location, r: []rune) void = {
    238 	for (let r .. r) {
    239 		loc.off += 1;
    240 		loc.col += 1;
    241 		if (r == '\n') {
    242 			loc.col = 1;
    243 			loc.line += 1;
    244 		};
    245 	};
    246 };