summaryrefslogtreecommitdiff
path: root/encoding/json/lex.ha
diff options
context:
space:
mode:
authorLassi Pulkkinen <lassi@pulk.fi>2024-10-31 03:11:21 +0200
committerLassi Pulkkinen <lassi@pulk.fi>2024-10-31 03:51:35 +0200
commitae44478b30d890fe0fb04022f44d474dcdcc3f9d (patch)
tree5f462459ae4b47d22114eed717d1382d08cf4dfe /encoding/json/lex.ha
Initial commit (import old repo)HEADmain
Diffstat (limited to 'encoding/json/lex.ha')
-rw-r--r--encoding/json/lex.ha417
1 files changed, 417 insertions, 0 deletions
diff --git a/encoding/json/lex.ha b/encoding/json/lex.ha
new file mode 100644
index 0000000..7b9bf12
--- /dev/null
+++ b/encoding/json/lex.ha
@@ -0,0 +1,417 @@
+// License: MPL-2.0
+// (c) 2022 Drew DeVault <sir@cmpwn.com>
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use os;
+use strconv;
+use strings;
+use memio;
+
+export type lexer = struct {
+ src: io::handle,
+ strbuf: memio::stream,
+ un: (token | void),
+ rb: (rune | void),
+ loc: (uint, uint),
+ prevloc: (uint, uint),
+ nextloc: (uint, uint),
+ prevrloc: (uint, uint),
+};
+
+// Creates a new JSON lexer. The caller may obtain tokens with [[lex]] and
+// should pass the result to [[close]] when they're done with it.
+export fn newlexer(src: io::handle) lexer = lexer {
+ src = src,
+ strbuf = memio::dynamic(),
+ un = void,
+ rb = void,
+ loc = (1, 0),
+ ...
+};
+
+// Frees state associated with a JSON lexer.
+export fn close(lex: *lexer) void = {
+ io::close(&lex.strbuf)!;
+};
+
+// Returns the next token from a JSON lexer. The return value is borrowed from
+// the lexer and will be overwritten on subsequent calls.
+export fn lex(lex: *lexer) (token | io::EOF | error) = {
+ match (lex.un) {
+ case void =>
+ lex.prevloc = lex.loc;
+ case let tok: token =>
+ lex.un = void;
+ lex.prevloc = lex.loc;
+ lex.loc = lex.nextloc;
+ return tok;
+ };
+
+ const rn = match (nextrunews(lex)?) {
+ case io::EOF =>
+ return io::EOF;
+ case let rn: rune =>
+ yield rn;
+ };
+
+ switch (rn) {
+ case '[' =>
+ return arraystart;
+ case ']' =>
+ return arrayend;
+ case '{' =>
+ return objstart;
+ case '}' =>
+ return objend;
+ case ',' =>
+ return comma;
+ case ':' =>
+ return colon;
+ case '"' =>
+ return scan_str(lex)?;
+ case =>
+ yield;
+ };
+
+ if (ascii::isdigit(rn) || rn == '-') {
+ unget(lex, rn);
+ return scan_number(lex)?;
+ };
+
+ if (!ascii::isalpha(rn)) {
+ return lex.loc: invalid;
+ };
+
+ unget(lex, rn);
+ const word = scan_word(lex)?;
+ switch (word) {
+ case "true" =>
+ return true;
+ case "false" =>
+ return false;
+ case "null" =>
+ return _null;
+ case =>
+ return lex.loc: invalid;
+ };
+};
+
+// "Unlexes" a token from the lexer, such that the next call to [[lex]] will
+// return that token again. Only one token can be unlexed at a time, otherwise
+// the program will abort.
+export fn unlex(lex: *lexer, tok: token) void = {
+ assert(lex.un is void, "encoding::json::unlex called twice in a row");
+ lex.un = tok;
+ lex.nextloc = lex.loc;
+ lex.loc = lex.prevloc;
+};
+
+// Scans until encountering a non-alphabetical character, returning the
+// resulting word.
+fn scan_word(lex: *lexer) (str | error) = {
+ memio::reset(&lex.strbuf);
+
+ for (true) {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ break;
+ };
+ if (!ascii::isalpha(rn)) {
+ unget(lex, rn);
+ break;
+ };
+ memio::appendrune(&lex.strbuf, rn)!;
+ };
+
+ return memio::string(&lex.strbuf)!;
+};
+
+type numstate = enum {
+ SIGN,
+ START,
+ ZERO,
+ INTEGER,
+ FRACSTART,
+ FRACTION,
+ EXPSIGN,
+ EXPSTART,
+ EXPONENT,
+};
+
+fn scan_number(lex: *lexer) (token | error) = {
+ memio::reset(&lex.strbuf);
+
+ let state = numstate::SIGN;
+ for (true) {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ break;
+ };
+
+ switch (state) {
+ case numstate::SIGN =>
+ state = numstate::START;
+ if (rn != '-') {
+ unget(lex, rn);
+ continue;
+ };
+ case numstate::START =>
+ switch (rn) {
+ case '0' =>
+ state = numstate::ZERO;
+ case =>
+ if (!ascii::isdigit(rn)) {
+ return lex.loc: invalid;
+ };
+ state = numstate::INTEGER;
+ };
+ case numstate::ZERO =>
+ switch (rn) {
+ case '.' =>
+ state = numstate::FRACSTART;
+ case 'e', 'E' =>
+ state = numstate::EXPSIGN;
+ case =>
+ if (ascii::isdigit(rn)) {
+ return lex.loc: invalid;
+ };
+ unget(lex, rn);
+ break;
+ };
+ case numstate::INTEGER =>
+ switch (rn) {
+ case '.' =>
+ state = numstate::FRACSTART;
+ case 'e', 'E' =>
+ state = numstate::EXPSIGN;
+ case =>
+ if (!ascii::isdigit(rn)) {
+ unget(lex, rn);
+ break;
+ };
+ };
+ case numstate::FRACSTART =>
+ if (!ascii::isdigit(rn)) {
+ return lex.loc: invalid;
+ };
+ state = numstate::FRACTION;
+ case numstate::FRACTION =>
+ switch (rn) {
+ case 'e', 'E' =>
+ state = numstate::EXPSIGN;
+ case =>
+ if (!ascii::isdigit(rn)) {
+ unget(lex, rn);
+ break;
+ };
+ };
+ case numstate::EXPSIGN =>
+ state = numstate::EXPSTART;
+ if (rn != '+' && rn != '-') {
+ unget(lex, rn);
+ continue;
+ };
+ case numstate::EXPSTART =>
+ if (!ascii::isdigit(rn)) {
+ return lex.loc: invalid;
+ };
+ state = numstate::EXPONENT;
+ case numstate::EXPONENT =>
+ if (!ascii::isdigit(rn)) {
+ unget(lex, rn);
+ break;
+ };
+ };
+
+ memio::appendrune(&lex.strbuf, rn)!;
+ };
+
+ match (strconv::stof64(memio::string(&lex.strbuf)!)) {
+ case let f: f64 =>
+ return f;
+ case =>
+ return lex.loc: invalid;
+ };
+};
+
+fn scan_str(lex: *lexer) (token | error) = {
+ memio::reset(&lex.strbuf);
+
+ for (true) {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ lex.loc.1 += 1;
+ return lex.loc: invalid;
+ };
+
+ switch (rn) {
+ case '"' =>
+ break;
+ case '\\' =>
+ const rn = scan_escape(lex)?;
+ memio::appendrune(&lex.strbuf, rn)!;
+ case =>
+ if (iscntrl(rn)) {
+ return lex.loc: invalid;
+ };
+ memio::appendrune(&lex.strbuf, rn)!;
+ };
+ };
+
+ return memio::string(&lex.strbuf)!;
+};
+
+fn scan_escape(lex: *lexer) (rune | error) = {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ return lex.loc: invalid;
+ };
+
+ switch (rn) {
+ case '\"' =>
+ return '\"';
+ case '\\' =>
+ return '\\';
+ case '/' =>
+ return '/';
+ case 'b' =>
+ return '\b';
+ case 'f' =>
+ return '\f';
+ case 'n' =>
+ return '\n';
+ case 'r' =>
+ return '\r';
+ case 't' =>
+ return '\t';
+ case 'u' =>
+ const u = scan_escape_codepoint(lex)?;
+
+ if (u >= 0xd800 && u <= 0xdfff) {
+ if (u >= 0xdc00) {
+ return lex.loc: invalid;
+ };
+
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ return lex.loc: invalid;
+ };
+ if (rn != '\\') {
+ return lex.loc: invalid;
+ };
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ return lex.loc: invalid;
+ };
+ if (rn != 'u') {
+ return lex.loc: invalid;
+ };
+
+ const v = scan_escape_codepoint(lex)?;
+ if (v < 0xdc00 || v > 0xdfff) {
+ return lex.loc: invalid;
+ };
+
+ const hi = u: u32 & 0x03ff;
+ const lo = v: u32 & 0x03ff;
+ return ((hi >> 10 | lo) + 0x10000): rune;
+ };
+
+ return u: u32: rune;
+ case =>
+ return lex.loc: invalid;
+ };
+};
+
+fn scan_escape_codepoint(lex: *lexer) (u16 | error) = {
+ let buf: [4]u8 = [0...];
+ match (io::readall(lex.src, buf)?) {
+ case io::EOF =>
+ return lex.loc: invalid;
+ case size =>
+ yield;
+ };
+ const s = match (strings::fromutf8(buf)) {
+ case let s: str =>
+ yield s;
+ case =>
+ return lex.loc: invalid;
+ };
+ match (strconv::stou16(s, strconv::base::HEX)) {
+ case let u: u16 =>
+ lex.loc.1 += 4;
+ return u;
+ case =>
+ return lex.loc: invalid;
+ };
+};
+
+// Gets the next rune from the lexer.
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+ if (lex.rb is rune) {
+ lex.prevrloc = lex.loc;
+ const r = lex.rb as rune;
+ lex.rb = void;
+ if (r == '\n') {
+ lex.loc = (lex.loc.0 + 1, 0);
+ } else {
+ lex.loc.1 += 1;
+ };
+ return r;
+ };
+ match (bufio::read_rune(lex.src)) {
+ case let err: io::error =>
+ return err;
+ case utf8::invalid =>
+ return lex.loc: invalid;
+ case io::EOF =>
+ return io::EOF;
+ case let rn: rune =>
+ lex.prevrloc = lex.loc;
+ if (rn == '\n') {
+ lex.loc = (lex.loc.0 + 1, 0);
+ } else {
+ lex.loc.1 += 1;
+ };
+ return rn;
+ };
+};
+
+// Like nextrune but skips whitespace.
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+ for (true) {
+ match (nextrune(lex)?) {
+ case let rn: rune =>
+ if (isspace(rn)) {
+ continue;
+ };
+ return rn;
+ case io::EOF =>
+ return io::EOF;
+ };
+ };
+};
+
+fn unget(lex: *lexer, r: rune) void = {
+ assert(lex.rb is void);
+ lex.rb = r;
+ lex.loc = lex.prevrloc;
+};
+
+fn iscntrl(r: rune) bool = r: u32 < 0x20;
+
+fn isspace(r: rune) bool = ascii::isspace(r) && r != '\f';