diff options
Diffstat (limited to 'encoding/json/lex.ha')
-rw-r--r-- | encoding/json/lex.ha | 417 |
1 files changed, 417 insertions, 0 deletions
// License: MPL-2.0
// (c) 2022 Drew DeVault <sir@cmpwn.com>
use ascii;
use bufio;
use encoding::utf8;
use io;
use os;
use strconv;
use strings;
use memio;

// State for a JSON lexer. Create one with [[newlexer]], pull tokens with
// [[lex]] and release it with [[close]].
export type lexer = struct {
	// Handle the JSON text is read from.
	src: io::handle,
	// Scratch buffer reused for every word, number and string scanned.
	strbuf: memio::stream,
	// Token pushed back by [[unlex]], if any.
	un: (token | void),
	// Rune pushed back by unget, if any.
	rb: (rune | void),
	// Current (line, column). Starts at (1, 0); a newline bumps the line
	// and resets the column to 0, any other rune bumps the column.
	loc: (uint, uint),
	// Value of loc when the last call to [[lex]] started; restored by
	// [[unlex]].
	prevloc: (uint, uint),
	// Value of loc saved by [[unlex]]; restored when the pushed-back token
	// is handed out again.
	nextloc: (uint, uint),
	// Value of loc before the most recently read rune; restored by unget.
	prevrloc: (uint, uint),
};

// Creates a new JSON lexer. The caller may obtain tokens with [[lex]] and
// should pass the result to [[close]] when they're done with it.
export fn newlexer(src: io::handle) lexer = lexer {
	src = src,
	strbuf = memio::dynamic(),
	un = void,
	rb = void,
	loc = (1, 0),
	...
};

// Frees state associated with a JSON lexer.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Returns the next token from a JSON lexer. The return value is borrowed from
// the lexer and will be overwritten on subsequent calls.
//
// Returns io::EOF once only whitespace remains, and an [[invalid]] error
// carrying the current location when the input is not a valid token.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	match (lex.un) {
	case void =>
		lex.prevloc = lex.loc;
	case let tok: token =>
		// Hand back the token stored by unlex and move the location
		// forward to where it was before the unlex.
		lex.un = void;
		lex.prevloc = lex.loc;
		lex.loc = lex.nextloc;
		return tok;
	};

	const rn = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		yield rn;
	};

	// Single-rune tokens and strings.
	switch (rn) {
	case '[' =>
		return arraystart;
	case ']' =>
		return arrayend;
	case '{' =>
		return objstart;
	case '}' =>
		return objend;
	case ',' =>
		return comma;
	case ':' =>
		return colon;
	case '"' =>
		return scan_str(lex)?;
	case =>
		yield;
	};

	// Numbers: push the first rune back so scan_number sees all of it.
	if (ascii::isdigit(rn) || rn == '-') {
		unget(lex, rn);
		return scan_number(lex)?;
	};

	if (!ascii::isalpha(rn)) {
		return lex.loc: invalid;
	};

	// Keywords: true, false, null.
	unget(lex, rn);
	const word = scan_word(lex)?;
	switch (word) {
	case "true" =>
		return true;
	case "false" =>
		return false;
	case "null" =>
		return _null;
	case =>
		return lex.loc: invalid;
	};
};

// "Unlexes" a token from the lexer, such that the next call to [[lex]] will
// return that token again. Only one token can be unlexed at a time, otherwise
// the program will abort.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "encoding::json::unlex called twice in a row");
	lex.un = tok;
	// Remember where we were, then rewind to the start of the token.
	lex.nextloc = lex.loc;
	lex.loc = lex.prevloc;
};

// Scans until encountering a non-alphabetical character, returning the
// resulting word. The terminating rune is pushed back. The returned string
// lives in the lexer's scratch buffer.
fn scan_word(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (!ascii::isalpha(rn)) {
			unget(lex, rn);
			break;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};

	return memio::string(&lex.strbuf)!;
};

// States of the number scanner. Each names what is expected next.
type numstate = enum {
	// Optional leading '-'.
	SIGN,
	// First digit of the integer part.
	START,
	// Integer part was a lone '0'; no further digits may follow.
	ZERO,
	// Inside a non-zero integer part.
	INTEGER,
	// Just saw '.'; at least one digit is required.
	FRACSTART,
	// Inside the fraction digits.
	FRACTION,
	// Just saw 'e'/'E'; optional '+' or '-'.
	EXPSIGN,
	// First exponent digit; at least one is required.
	EXPSTART,
	// Inside the exponent digits.
	EXPONENT,
};

// Scans a JSON number and returns it as an f64 token. The rune that ends the
// number is pushed back. Returns [[invalid]] if the text does not follow the
// JSON number grammar or cannot be converted by strconv::stof64.
fn scan_number(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	let state = numstate::SIGN;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};

		switch (state) {
		case numstate::SIGN =>
			state = numstate::START;
			if (rn != '-') {
				// No sign: re-read this rune in the START
				// state without appending it.
				unget(lex, rn);
				continue;
			};
		case numstate::START =>
			switch (rn) {
			case '0' =>
				state = numstate::ZERO;
			case =>
				if (!ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				state = numstate::INTEGER;
			};
		case numstate::ZERO =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				// Leading zeroes are not allowed.
				if (ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				unget(lex, rn);
				break;
			};
		case numstate::INTEGER =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::FRACSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::FRACTION;
		case numstate::FRACTION =>
			switch (rn) {
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::EXPSIGN =>
			state = numstate::EXPSTART;
			if (rn != '+' && rn != '-') {
				// No exponent sign: re-read this rune in the
				// EXPSTART state without appending it.
				unget(lex, rn);
				continue;
			};
		case numstate::EXPSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::EXPONENT;
		case numstate::EXPONENT =>
			if (!ascii::isdigit(rn)) {
				unget(lex, rn);
				break;
			};
		};

		memio::appendrune(&lex.strbuf, rn)!;
	};

	// EOF can end the loop in any state. Only these four states mean a
	// complete number was read; anything else ("-", "1.", "1e", "1e+")
	// is a truncated number and must be rejected here rather than left
	// to whatever strconv::stof64 happens to accept.
	const complete = state == numstate::ZERO
		|| state == numstate::INTEGER
		|| state == numstate::FRACTION
		|| state == numstate::EXPONENT;
	if (!complete) {
		return lex.loc: invalid;
	};

	match (strconv::stof64(memio::string(&lex.strbuf)!)) {
	case let f: f64 =>
		return f;
	case =>
		return lex.loc: invalid;
	};
};

// Scans the remainder of a string after its opening '"' has been consumed,
// decoding escape sequences. Returns [[invalid]] on EOF before the closing
// quote or on an unescaped control character. The returned string lives in
// the lexer's scratch buffer.
fn scan_str(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			lex.loc.1 += 1;
			return lex.loc: invalid;
		};

		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			const rn = scan_escape(lex)?;
			memio::appendrune(&lex.strbuf, rn)!;
		case =>
			if (iscntrl(rn)) {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};

	return memio::string(&lex.strbuf)!;
};

// Decodes one escape sequence after its leading '\' has been consumed and
// returns the rune it stands for. A \uXXXX high surrogate must be followed
// immediately by a \uXXXX low surrogate; the pair is combined into a single
// code point. Returns [[invalid]] for unknown escapes, EOF, or unpaired
// surrogates.
fn scan_escape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};

	switch (rn) {
	case '\"' =>
		return '\"';
	case '\\' =>
		return '\\';
	case '/' =>
		return '/';
	case 'b' =>
		return '\b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'u' =>
		const u = scan_escape_codepoint(lex)?;

		if (u >= 0xd800 && u <= 0xdfff) {
			// A low surrogate may not come first.
			if (u >= 0xdc00) {
				return lex.loc: invalid;
			};

			// Expect "\u" introducing the low surrogate.
			const rn = match (nextrune(lex)?) {
			case let rn: rune =>
				yield rn;
			case io::EOF =>
				return lex.loc: invalid;
			};
			if (rn != '\\') {
				return lex.loc: invalid;
			};
			const rn = match (nextrune(lex)?) {
			case let rn: rune =>
				yield rn;
			case io::EOF =>
				return lex.loc: invalid;
			};
			if (rn != 'u') {
				return lex.loc: invalid;
			};

			const v = scan_escape_codepoint(lex)?;
			if (v < 0xdc00 || v > 0xdfff) {
				return lex.loc: invalid;
			};

			// UTF-16 decoding: the high surrogate supplies the
			// upper ten bits and the low surrogate the lower ten
			// bits of (code point - 0x10000).
			const hi = u: u32 & 0x03ff;
			const lo = v: u32 & 0x03ff;
			return (((hi << 10) | lo) + 0x10000): rune;
		};

		return u: u32: rune;
	case =>
		return lex.loc: invalid;
	};
};

// Reads the four hexadecimal digits of a \uXXXX escape and returns their
// value, advancing the column by four on success. The bytes are read directly
// from lex.src rather than through nextrune; this is only called right after
// nextrune has returned, at which point no rune is pushed back.
fn scan_escape_codepoint(lex: *lexer) (u16 | error) = {
	let buf: [4]u8 = [0...];
	match (io::readall(lex.src, buf)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case size =>
		yield;
	};
	const s = match (strings::fromutf8(buf)) {
	case let s: str =>
		yield s;
	case =>
		return lex.loc: invalid;
	};
	match (strconv::stou16(s, strconv::base::HEX)) {
	case let u: u16 =>
		lex.loc.1 += 4;
		return u;
	case =>
		return lex.loc: invalid;
	};
};

// Gets the next rune from the lexer, preferring a rune pushed back by unget
// over reading from lex.src. Saves the old location in prevrloc and then
// advances loc past the rune.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	if (lex.rb is rune) {
		lex.prevrloc = lex.loc;
		const r = lex.rb as rune;
		lex.rb = void;
		if (r == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return r;
	};
	match (bufio::read_rune(lex.src)) {
	case let err: io::error =>
		return err;
	case utf8::invalid =>
		return lex.loc: invalid;
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		lex.prevrloc = lex.loc;
		if (rn == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return rn;
	};
};

// Like nextrune but skips whitespace.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (isspace(rn)) {
				continue;
			};
			return rn;
		case io::EOF =>
			return io::EOF;
		};
	};
};

// Pushes a rune back so the next nextrune call returns it again, and rewinds
// loc to where it was before that rune was read. Only one rune may be pushed
// back at a time; a second unget without an intervening read aborts.
fn unget(lex: *lexer, r: rune) void = {
	assert(lex.rb is void);
	lex.rb = r;
	lex.loc = lex.prevrloc;
};

// Reports whether a rune is a control character that must be escaped inside
// a JSON string (anything below U+0020).
fn iscntrl(r: rune) bool = r: u32 < 0x20;

// Reports whether a rune is JSON whitespace. RFC 8259 allows exactly space,
// horizontal tab, line feed and carriage return; form feed and vertical tab
// are not whitespace in JSON even though ascii::isspace accepts them.
fn isspace(r: rune) bool =
	r == ' ' || r == '\t' || r == '\n' || r == '\r';