1 files changed, 417 insertions, 0 deletions
diff --git a/encoding/json/lex.ha b/encoding/json/lex.ha
new file mode 100644
index 0000000..7b9bf12
--- /dev/null
+++ b/encoding/json/lex.ha
@@ -0,0 +1,417 @@
+// License: MPL-2.0
+// (c) 2022 Drew DeVault <sir@cmpwn.com>
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use os;
+use strconv;
+use strings;
+use memio;
+
+// State of a JSON lexer. Obtain one from [[newlexer]] and release it with
+// [[close]].
+export type lexer = struct {
+	// Handle the JSON text is read from.
+	src: io::handle,
+	// Scratch buffer in which words, numbers and strings are accumulated
+	// while they are being scanned.
+	strbuf: memio::stream,
+	// Token pushed back by [[unlex]], or void if there is none.
+	un: (token | void),
+	// Rune pushed back by unget, or void if there is none.
+	rb: (rune | void),
+	// Current location; the first field is bumped and the second reset to
+	// zero whenever a newline is consumed.
+	loc: (uint, uint),
+	// Value of loc saved at the start of the most recent [[lex]] call.
+	prevloc: (uint, uint),
+	// Value of loc saved by [[unlex]], restored when the pushed-back token
+	// is handed out again.
+	nextloc: (uint, uint),
+	// Value of loc saved before the most recent rune was consumed; unget
+	// restores loc from it.
+	prevrloc: (uint, uint),
+};
+
+// Initializes a JSON lexer which reads from src. Tokens are retrieved with
+// [[lex]]; once finished, the caller should hand the lexer to [[close]].
+export fn newlexer(src: io::handle) lexer = {
+	return lexer {
+		src = src,
+		strbuf = memio::dynamic(),
+		un = void,
+		rb = void,
+		// Start with the first field at one and the second at zero; the
+		// remaining location fields are zero-filled.
+		loc = (1, 0),
+		...
+	};
+};
+
+// Releases the string buffer owned by a JSON lexer. The source handle the
+// lexer reads from is not closed here.
+export fn close(lex: *lexer) void = io::close(&lex.strbuf)!;
+
+// Produces the next token from a JSON lexer, or io::EOF once the input is
+// exhausted. The returned token is borrowed from the lexer and is overwritten
+// by subsequent calls.
+export fn lex(lex: *lexer) (token | io::EOF | error) = {
+	// A token handed back through unlex takes priority over fresh input.
+	const pending = lex.un;
+	lex.prevloc = lex.loc;
+	if (pending is token) {
+		lex.un = void;
+		lex.loc = lex.nextloc;
+		return pending as token;
+	};
+
+	const first = match (nextrunews(lex)?) {
+	case let r: rune =>
+		yield r;
+	case io::EOF =>
+		return io::EOF;
+	};
+
+	// Structural characters and strings are recognised by a single rune.
+	switch (first) {
+	case '{' =>
+		return objstart;
+	case '}' =>
+		return objend;
+	case '[' =>
+		return arraystart;
+	case ']' =>
+		return arrayend;
+	case ':' =>
+		return colon;
+	case ',' =>
+		return comma;
+	case '"' =>
+		return scan_str(lex)?;
+	case =>
+		yield;
+	};
+
+	const numeric = first == '-' || ascii::isdigit(first);
+	if (!numeric && !ascii::isalpha(first)) {
+		return lex.loc: invalid;
+	};
+
+	// Numbers and keywords are rescanned starting from their first rune.
+	unget(lex, first);
+	if (numeric) {
+		return scan_number(lex)?;
+	};
+
+	const word = scan_word(lex)?;
+	if (word == "true") {
+		return true;
+	} else if (word == "false") {
+		return false;
+	} else if (word == "null") {
+		return _null;
+	};
+	return lex.loc: invalid;
+};
+
+// Pushes a token back onto the lexer so that the next call to [[lex]] returns
+// it again. At most one token may be pushed back at a time; pushing a second
+// one before the first has been consumed aborts the program.
+export fn unlex(lex: *lexer, tok: token) void = {
+	assert(lex.un is void, "encoding::json::unlex called twice in a row");
+	// Remember the current location, then rewind to the one recorded by
+	// the last call to lex.
+	lex.nextloc = lex.loc;
+	lex.loc = lex.prevloc;
+	lex.un = tok;
+};
+
+// Collects consecutive ASCII letters into the lexer's string buffer and
+// returns them as a string borrowed from that buffer. Scanning stops at end
+// of input or at the first non-letter, which is pushed back for the next
+// read.
+fn scan_word(lex: *lexer) (str | error) = {
+	memio::reset(&lex.strbuf);
+
+	for (true) {
+		match (nextrune(lex)?) {
+		case io::EOF =>
+			break;
+		case let r: rune =>
+			if (ascii::isalpha(r)) {
+				memio::appendrune(&lex.strbuf, r)!;
+			} else {
+				unget(lex, r);
+				break;
+			};
+		};
+	};
+
+	return memio::string(&lex.strbuf)!;
+};
+
+// States of the number scanner in scan_number.
+type numstate = enum {
+	// Nothing consumed yet; an optional '-' may appear here.
+	SIGN,
+	// The first digit of the integer part is required.
+	START,
+	// The integer part is a single '0'; another digit is rejected.
+	ZERO,
+	// Inside an integer part that began with a non-zero digit.
+	INTEGER,
+	// A '.' was consumed; a digit is required.
+	FRACSTART,
+	// Inside the digits of the fraction.
+	FRACTION,
+	// An 'e' or 'E' was consumed; an optional '+' or '-' may appear here.
+	EXPSIGN,
+	// The first digit of the exponent is required.
+	EXPSTART,
+	// Inside the digits of the exponent.
+	EXPONENT,
+};
+
+// Scans a JSON number and returns it as an f64 token. The caller is expected
+// to have pushed the number's first rune back with unget. The text is
+// validated against the JSON number grammar by a small state machine while it
+// is copied into the lexer's string buffer, and is then converted with
+// strconv::stof64. The rune which terminates the number is pushed back.
+//
+// Returns invalid if the text is not a well-formed JSON number, including the
+// case where the input ends part-way through one (e.g. "-", "1." or "1e+"),
+// or if the conversion fails.
+fn scan_number(lex: *lexer) (token | error) = {
+	memio::reset(&lex.strbuf);
+
+	let state = numstate::SIGN;
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+
+		switch (state) {
+		case numstate::SIGN =>
+			state = numstate::START;
+			if (rn != '-') {
+				// No sign: re-read this rune in the START state.
+				unget(lex, rn);
+				continue;
+			};
+		case numstate::START =>
+			switch (rn) {
+			case '0' =>
+				state = numstate::ZERO;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					return lex.loc: invalid;
+				};
+				state = numstate::INTEGER;
+			};
+		case numstate::ZERO =>
+			switch (rn) {
+			case '.' =>
+				state = numstate::FRACSTART;
+			case 'e', 'E' =>
+				state = numstate::EXPSIGN;
+			case =>
+				// Leading zeroes are not permitted.
+				if (ascii::isdigit(rn)) {
+					return lex.loc: invalid;
+				};
+				unget(lex, rn);
+				break;
+			};
+		case numstate::INTEGER =>
+			switch (rn) {
+			case '.' =>
+				state = numstate::FRACSTART;
+			case 'e', 'E' =>
+				state = numstate::EXPSIGN;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					unget(lex, rn);
+					break;
+				};
+			};
+		case numstate::FRACSTART =>
+			if (!ascii::isdigit(rn)) {
+				return lex.loc: invalid;
+			};
+			state = numstate::FRACTION;
+		case numstate::FRACTION =>
+			switch (rn) {
+			case 'e', 'E' =>
+				state = numstate::EXPSIGN;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					unget(lex, rn);
+					break;
+				};
+			};
+		case numstate::EXPSIGN =>
+			state = numstate::EXPSTART;
+			if (rn != '+' && rn != '-') {
+				// No sign: re-read this rune in the EXPSTART
+				// state.
+				unget(lex, rn);
+				continue;
+			};
+		case numstate::EXPSTART =>
+			if (!ascii::isdigit(rn)) {
+				return lex.loc: invalid;
+			};
+			state = numstate::EXPONENT;
+		case numstate::EXPONENT =>
+			if (!ascii::isdigit(rn)) {
+				unget(lex, rn);
+				break;
+			};
+		};
+
+		memio::appendrune(&lex.strbuf, rn)!;
+	};
+
+	// Every break on a terminating rune happens in an accepting state, so
+	// ending up in any other state means the input ran out part-way through
+	// the number. Reject that here rather than depending on how lenient the
+	// float parser is with text such as "1.".
+	switch (state) {
+	case numstate::SIGN, numstate::START, numstate::FRACSTART,
+		numstate::EXPSIGN, numstate::EXPSTART =>
+		return lex.loc: invalid;
+	case =>
+		yield;
+	};
+
+	match (strconv::stof64(memio::string(&lex.strbuf)!)) {
+	case let f: f64 =>
+		return f;
+	case =>
+		return lex.loc: invalid;
+	};
+};
+
+// Scans the body of a JSON string, the opening quote having already been
+// consumed, and returns its decoded contents as a string token borrowed from
+// the lexer's string buffer. Escape sequences are decoded by scan_escape.
+// Returns invalid if an unescaped control character is found or if the input
+// ends before the closing quote.
+fn scan_str(lex: *lexer) (token | error) = {
+	memio::reset(&lex.strbuf);
+
+	for (true) {
+		const r = match (nextrune(lex)?) {
+		case io::EOF =>
+			// Report the position just past the last rune read.
+			lex.loc.1 += 1;
+			return lex.loc: invalid;
+		case let r: rune =>
+			yield r;
+		};
+
+		if (r == '"') {
+			break;
+		} else if (r == '\\') {
+			const decoded = scan_escape(lex)?;
+			memio::appendrune(&lex.strbuf, decoded)!;
+		} else if (iscntrl(r)) {
+			return lex.loc: invalid;
+		} else {
+			memio::appendrune(&lex.strbuf, r)!;
+		};
+	};
+
+	return memio::string(&lex.strbuf)!;
+};
+
+// Decodes the remainder of an escape sequence, the leading backslash having
+// already been consumed, and returns the rune it denotes. A \uXXXX escape in
+// the high surrogate range (D800-DBFF) must be followed immediately by a
+// second \uXXXX escape in the low surrogate range (DC00-DFFF); the pair is
+// combined into a single code point.
+//
+// Returns invalid for an unknown escape, a lone or misordered surrogate, or
+// input which ends before the escape is complete.
+fn scan_escape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return lex.loc: invalid;
+	};
+
+	switch (rn) {
+	case '\"' =>
+		return '\"';
+	case '\\' =>
+		return '\\';
+	case '/' =>
+		return '/';
+	case 'b' =>
+		return '\b';
+	case 'f' =>
+		return '\f';
+	case 'n' =>
+		return '\n';
+	case 'r' =>
+		return '\r';
+	case 't' =>
+		return '\t';
+	case 'u' =>
+		const u = scan_escape_codepoint(lex)?;
+
+		if (u >= 0xd800 && u <= 0xdfff) {
+			// A low surrogate may not come first.
+			if (u >= 0xdc00) {
+				return lex.loc: invalid;
+			};
+
+			// The high surrogate must be followed directly by
+			// "\u" and a low surrogate.
+			const backslash = match (nextrune(lex)?) {
+			case let rn: rune =>
+				yield rn;
+			case io::EOF =>
+				return lex.loc: invalid;
+			};
+			if (backslash != '\\') {
+				return lex.loc: invalid;
+			};
+			const marker = match (nextrune(lex)?) {
+			case let rn: rune =>
+				yield rn;
+			case io::EOF =>
+				return lex.loc: invalid;
+			};
+			if (marker != 'u') {
+				return lex.loc: invalid;
+			};
+
+			const v = scan_escape_codepoint(lex)?;
+			if (v < 0xdc00 || v > 0xdfff) {
+				return lex.loc: invalid;
+			};
+
+			// Each surrogate carries ten bits of the offset from
+			// U+10000: the high surrogate supplies the upper ten
+			// and the low surrogate the lower ten.
+			const hi = u: u32 & 0x03ff;
+			const lo = v: u32 & 0x03ff;
+			return (((hi << 10) | lo) + 0x10000): rune;
+		};
+
+		return u: u32: rune;
+	case =>
+		return lex.loc: invalid;
+	};
+};
+
+// Reads the four hexadecimal digits of a \uXXXX escape directly from the
+// source handle and returns their value. On success the second field of the
+// lexer's location is advanced by four. Returns invalid if the input ends
+// before any digit is read, if the bytes are not valid UTF-8, or if they do
+// not parse as a base-16 u16.
+fn scan_escape_codepoint(lex: *lexer) (u16 | error) = {
+	let digits: [4]u8 = [0...];
+	if (io::readall(lex.src, digits)? is io::EOF) {
+		return lex.loc: invalid;
+	};
+
+	const text = match (strings::fromutf8(digits)) {
+	case let text: str =>
+		yield text;
+	case =>
+		return lex.loc: invalid;
+	};
+
+	const code = match (strconv::stou16(text, strconv::base::HEX)) {
+	case let code: u16 =>
+		yield code;
+	case =>
+		return lex.loc: invalid;
+	};
+
+	lex.loc.1 += 4;
+	return code;
+};
+
+// Fetches the next rune, preferring one pushed back by unget over reading
+// from the source, and returns io::EOF at end of input. Before the rune is
+// handed out the current location is saved so that unget can restore it; a
+// newline then bumps the first location field and resets the second to zero,
+// while any other rune bumps the second field.
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+	const r = match (lex.rb) {
+	case let buffered: rune =>
+		lex.rb = void;
+		yield buffered;
+	case void =>
+		yield match (bufio::read_rune(lex.src)) {
+		case let fresh: rune =>
+			yield fresh;
+		case io::EOF =>
+			return io::EOF;
+		case utf8::invalid =>
+			return lex.loc: invalid;
+		case let err: io::error =>
+			return err;
+		};
+	};
+
+	lex.prevrloc = lex.loc;
+	if (r == '\n') {
+		lex.loc = (lex.loc.0 + 1, 0);
+	} else {
+		lex.loc.1 += 1;
+	};
+	return r;
+};
+
+// Behaves like nextrune, except that runes for which isspace is true are
+// discarded rather than returned.
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		const r = match (nextrune(lex)?) {
+		case io::EOF =>
+			return io::EOF;
+		case let r: rune =>
+			yield r;
+		};
+		if (!isspace(r)) {
+			return r;
+		};
+	};
+};
+
+// Pushes r back so that the next call to nextrune returns it again, and
+// restores the location saved before r was consumed. Only one rune may be
+// pushed back at a time; the assertion fails otherwise.
+fn unget(lex: *lexer, r: rune) void = {
+	assert(lex.rb is void);
+	lex.loc = lex.prevrloc;
+	lex.rb = r;
+};
+
+// Reports whether r is a code point below U+0020.
+fn iscntrl(r: rune) bool = {
+	return (r: u32) < 0x20;
+};
+
+// Reports whether r is JSON whitespace: space, horizontal tab, line feed or
+// carriage return. Other runes which ascii::isspace accepts, such as vertical
+// tab and form feed, are not whitespace in JSON and are rejected here.
+fn isspace(r: rune) bool = r == ' ' || r == '\t' || r == '\n' || r == '\r';