Alternatives to Functional Parsing

I appreciate that Zig makes functional programming possible, but awkward. It makes you think twice about what you’re doing.

During Advent of Code, I wrote a small parsing library using a functional parser, because that’s the only way I know how to, but I’m wondering if there’s a better way in Zig that doesn’t involve writing each parser as its own, non-reusable function.

i.e. Is this code terrible?

In a perfect world, the “sub-parsers” would get inlined at comptime, so you’d just have one output function doing all the heavy lifting, but I imagine that’s not entirely possible.

Here is one of the completed parsers:

/// Parse `mul(x, y)`, where x and y are unsigned integers
/// The two integers are returned as raw digit slices; see `mul` for
/// the conversion to a numeric value.
// zig fmt: off
const parse_mul: parse.Parser(struct { []u8, []u8 }) = parse.preceded(
    // T: output type of the discarded prefix parser (`tag` yields []const u8).
    []const u8,
    // U: output type of the kept parser (the pair of digit slices).
    struct { []u8, []u8 },
    // Consume the literal "mul(" prefix, then…
    parse.tag("mul("),
    // …parse the payload and require a trailing ')'.
    parse.terminated(
        struct { []u8, []u8 },
        void,
        // Two digit runs separated by a comma; the comma itself is dropped.
        parse.delimited(
            []u8,
            void,
            []u8,
            parse.take_while_digit,
            parse.skip(','),
            parse.take_while_digit,
        ),
        parse.skip(')')
    )
);

/// Parse both digit slices as base-10 u32 values and return their product.
/// Returns `error.Overflow` (from parsing or from the multiply) or
/// `error.InvalidCharacter` on malformed input.
/// Fix: the original `a_val * b_val` was an unchecked multiply, which
/// panics on overflow in safe builds and is UB in ReleaseFast;
/// `std.math.mul` reports `error.Overflow` instead.
fn mul(a: []u8, b: []u8) !u32 {
    const a_val = try std.fmt.parseInt(u32, a, 10);
    const b_val = try std.fmt.parseInt(u32, b, 10);

    return try std.math.mul(u32, a_val, b_val);
}

And the library it uses:

// parse.zig

const std = @import("std");

/// The error set shared by every parser in this library.
pub const ParseError = error{
    /// The parser could succeed given more input, but ran out.
    Incomplete,
    /// An unrecoverable failure has occurred.
    Panic,
};

/// The type of a parser producing a `T`: takes the remaining input and
/// returns the parsed value together with the unconsumed tail.
/// Note: this is a plain `fn` type (not `*const fn`), so parser values
/// are comptime-known and eligible for inlining when composed.
pub fn Parser(comptime T: type) type {
    return fn ([]u8) ParseError!struct { T, []u8 };
}

/// Match a string of one or more characters.
/// On success, returns the pattern itself and the input after it.
/// Errors: `Incomplete` if the input is shorter than the pattern,
/// `Panic` if the input does not start with the pattern.
pub fn tag(comptime pattern: []const u8) Parser([]const u8) {
    const s = struct {
        fn invoke(rest: []u8) ParseError!struct { []const u8, []u8 } {
            // Not enough bytes to decide either way.
            if (rest.len < pattern.len) return ParseError.Incomplete;

            // Any mismatch inside the pattern window is unrecoverable.
            if (!std.mem.eql(u8, rest[0..pattern.len], pattern)) {
                return ParseError.Panic;
            }

            return .{ pattern, rest[pattern.len..] };
        }
    };

    return s.invoke;
}

/// Look for the given character, and then discard it.
/// Errors: `Incomplete` on empty input, `Panic` if the first byte
/// is not `char`.
pub fn skip(comptime char: u8) Parser(void) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { void, []u8 } {
            // Guard clauses instead of nested if/else.
            if (rest.len == 0) return ParseError.Incomplete;
            if (rest[0] != char) return ParseError.Panic;

            return .{ {}, rest[1..] };
        }
    };

    return s.invoke;
}

/// Parse a run of decimal digits from a string and return the value as
/// an i32, along with the unconsumed input.
/// Note: `take_while_digit` never yields a sign, so the parsed value is
/// always non-negative; i32 is used purely for downstream convenience.
pub fn int(
    rest: []u8,
) !struct { i32, []u8 } {
    const digits, const remaining = try take_while_digit(rest);

    return .{ try std.fmt.parseInt(i32, digits, 10), remaining };
}

/// Parse one or more consecutive (base 10) digits.
/// Returns the digit prefix and the unconsumed tail.
/// Errors:
///  - `Incomplete` when the input is empty (more input could help);
///  - `Panic` when the input does not start with a digit.
///
/// Fix: the original special-cased `rest.len == 1` and silently
/// succeeded with an empty match on a single non-digit byte, while the
/// multi-byte path returned `Panic` for the same situation. The two
/// paths are now unified, and the single-digit case no longer returns a
/// pointer to static empty memory (it slices the input instead).
pub fn take_while_digit(rest: []u8) ParseError!struct { []u8, []u8 } {
    if (rest.len == 0) return ParseError.Incomplete;

    var index: usize = 0;
    while (index < rest.len and is_digit(rest[index])) : (index += 1) {}

    if (index == 0) return ParseError.Panic;

    return .{ rest[0..index], rest[index..] };
}

test "take_while_digit" {
    const input = "837.152";
    // Fix: `take_while_digit` returns an error union, which must be
    // unwrapped with `try` before the result tuple can be indexed —
    // the original did not compile.
    // `@constCast` is tolerable here only because the parser never
    // mutates its input; writing through it would be UB.
    const result = try take_while_digit(@constCast(input));
    try std.testing.expectEqualSlices(u8, "837", result[0]);
    try std.testing.expectEqualSlices(u8, ".152", result[1]);
}

/// Return true if `char` is an ASCII decimal digit.
pub fn is_digit(char: u64) bool {
    // Inclusive range check; identical to the exclusive form using the
    // bracketing characters '/' (47) and ':' (58).
    return char >= '0' and char <= '9';
}

test "is_digit" {
    // Sanity-check the ASCII codes the implementation relies on.
    try std.testing.expectEqual('0', 48);
    try std.testing.expectEqual('9', 57);

    // Every decimal digit must be accepted.
    for ("0123456789") |digit| {
        try std.testing.expect(is_digit(digit));
    }

    try std.testing.expect(!is_digit('a'));
}

/// Discard all input until a digit is found.
///
/// Fix: the original `return .Incomplete;` did not compile — an enum
/// literal cannot coerce to an error union; the error must be spelled
/// `ParseError.Incomplete`. Also removed the stale `// u64 {` comment.
///
/// NOTE(review): returning `Incomplete` when the very first byte is
/// already a digit (nothing to skip) is surprising — confirm callers
/// rely on it. Likewise, exhausting the input without finding a digit
/// currently *succeeds* with an empty remainder rather than erroring.
fn skip_until_digit(rest: []u8) ParseError!struct { void, []u8 } {
    var index: usize = 0;
    while (index < rest.len) : (index += 1) {
        if (is_digit(rest[index])) {
            if (index > 0) {
                return .{ {}, rest[index..] };
            } else {
                return ParseError.Incomplete;
            }
        }
    }

    return .{ {}, &[_]u8{} };
}

/// Discard the results of the first parser, then return the results of the
/// second parser.
pub fn preceded(
    comptime T: type,
    comptime U: type,
    drop: Parser(T),
    keep: Parser(U),
) Parser(U) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { U, []u8 } {
            // Run the prefix parser only for its input consumption.
            const dropped = try drop(rest);

            return keep(dropped[1]);
        }
    };

    return s.invoke;
}

/// Return the results of the first parser, then discard the results of the
/// second parser.
pub fn terminated(
    comptime T: type,
    comptime U: type,
    keep: Parser(T),
    drop: Parser(U),
) Parser(T) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { T, []u8 } {
            const kept = try keep(rest);
            // The suffix parser's value is thrown away; only its
            // consumption of input matters.
            const dropped = try drop(kept[1]);

            return .{ kept[0], dropped[1] };
        }
    };

    return s.invoke;
}

/// Return the results of left and right, discarding the middle.
pub fn delimited(
    comptime T: type,
    comptime U: type,
    comptime V: type,
    left: Parser(T),
    middle: Parser(U),
    right: Parser(V),
) Parser(struct { T, V }) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { struct { T, V }, []u8 } {
            // Thread the remaining input through each stage explicitly.
            const left_out, const after_left = try left(rest);
            _, const after_middle = try middle(after_left);
            const right_out, const after_right = try right(after_middle);

            return .{ .{ left_out, right_out }, after_right };
        }
    };

    return s.invoke;
}

The only nitpick I have with your code is the amount of non const slices, as it forces at least one allocation at the beginning. Not an issue, since you’d do that anyway when getting the parser input externally.

I think any non-trivial parser will have reusable sub parsers, so I’m not sure what you’re asking.

The parsers will probably be inlined, at least the simple ones will be, this is because you are using the fn ... types which are comptime known, a runtime known function would be *const fn ....
Though you should definitely check the assembly and force inline trivial parses if they aren’t already.

I prefer using objdump in the terminal though as it doesn’t require exporting functions which is a real pain since you have to make them compatible with the c abi
just need to export a wrapper function that uses what you want to inspect, otherwise zig will remove it as it's unused.

4 Likes

In my experience, the simplest & most maintainable way to write a parser is to just directly write the parser’s code by hand. I have two relevant tutorials (in Rust, but they use zero language features):

I wouldn’t go as far as saying that hand-writing the parser is always a solution, but I do highly recommend learning parsing “by hand” before learning about parser combinators or parser generators, to have a more complete picture of available options. The education tends to start with Yacc and such, and I think this is pedagogically confusing (I was certainly confused about parsing a lot until relatively late in my career).

7 Likes

:eyes: Wow, an actual code review- Thank You!

I’ve done very little work with low-level languages before, so I didn’t even know that was an option :slight_smile:

I figured functional programming and function composition are inherently inefficient to run, hence why Zig “doesn’t have closures”, and makes it so awkward for you to do FP. When I tried to think about how to lower the function count, the only thing I could think to do was manually inline all the functions myself, as I have no idea what the compiler can and can’t inline. But it sounds like maybe I don’t need to be as worried about it as I am.

Thank you for the detailed response! I never set out to learn parsers specifically, I just heard about parser combinators while I was tinkering with Haskell, and they piqued my interest.

Thanks for showing me a more object-oriented approach. I personally find the functional style more intuitive, but the next time I have to write a parser I will definitely come back to your articles so I can experience both methods.

It’s definitely not object-oriented! Just old-school imperative.

1 Like

My apologies, I realize that can be a loaded term, and no offense was intended!

In my mind, (good) OOP is about methods and private properties, not inheritance. But that’s a whole 'nother discussion :sweat_smile:

I personally just use simple, single-pass, top down recursive parsers, since they are easy to write and easy to understand, it’s all just plain and simple procedural code, no magic abstractions.

In your case it would be something like this pseudo code:

fn parseMul(stream) {
    const name = parseIdentifier(stream);
    if(name != "mul") return error...;
    stream.skipWhitespaces();

    if(stream.next() != '(') return error...;
    stream.skipWhitespaces();

    const first = parseInt(stream);
    stream.skipWhitespaces();

    if(stream.next() != ',') return error...;
    stream.skipWhitespaces();

    const second = parseInt(stream);
    if(stream.next() != ')') return error...;

    return .{first, second};
}

fn parseIdentifier(stream) {
    const buf = stream.remaining;
    var len = buf.len;
    for(buf, 0..) |c, i| {
        if(!isAlphanum(c) && c != '_') {
            len = i;
            break;
        }
    }
    stream.advance(len);
    return buf[0..len];
}

fn parseInt(stream) {
    var value: usize = 0;
    while(true) {
        if(!isDigit(stream.peek())) {
            break;
        }
        }
        value = value*10 + (stream.next() - '0');
    }
    return value;
}

The only really annoying thing with this approach is that you have to skip the whitespaces everywhere. But that could be easily solved by just using a tokenizer first.

2 Likes

I agree with this, and just want to add a caveat: things are different if one is also inventing the grammar which is being parsed. In that case I suggest supplementing with formal methods.

It’s very easy to write a parser which ad-hoc describes a language with various problems, ones which only show up when someone else tries to parse it with new code, or worse, when an adversary studies the parser implementation, finds a weird machine in it, and crafts an input which can lead to privilege escalation.

The way to mitigate that is to evolve the grammar using a parsing tool which bases the code on that grammar.

If the format already exists, then hand-rolling has a lot to offer in the way of control. Someone else has already confirmed that the format has the necessary properties, so the extra hassle and abstraction isn’t necessarily worth it. Even so I suggest fuzzing it intensely, parsers are a great way to let malicious actors get inside the rest of the program.

1 Like

Would you mind elaborating on this? By “formal methods”, do you mean defining a grammar using ABNF or something else entirely?

1 Like

I am not @mnemnion but I will try to answer. Creating a new grammar is akin to writing a new program (but in a different language). You can then “compile” your program (implement the grammar) manually, or you can use a “compiler” for this (yacc / bison, lex / flex). These tools have advantages and disadvantages over doing things manually; in my mind, foremost among the advantages is the possibility of validating how ambiguous your grammar is (think shift-reduce and reduce-reduce / reduce-accept conflicts). Dealing with these conflicts may be a pain in the ass, but knowing they exist is invaluable.

2 Likes

Thank you, that sounds like a perfect answer.

@gonzo got this right: anything which uses a ‘grammatical’ format to define a parser is employing formal methods. I would add that, in addition to the ambiguity problems he mentions, writing a context dependent† language (like C’s typedef problem) is quite easy, and results in semantic aspects of the language leaking into the parsing of it.

Zig, for example, uses a Parsing Expression Grammar to define the grammar of the language, which I consider the best option (this is a minority point of view, I must add).

One could use something like LPEG to develop the grammar, and a corpus for conformational testing, and then write a recursive-descent parser (with Pratt characteristics!) for use in the program itself. The close semantic match between PEG parsing and recursive descent makes this particularly pleasant.

But really, anything which generates the parser from a formal specification will do the trick.

† Not the same as a context sensitive grammar, while those can be difficult to implement they don’t pose the layer-abstraction problems of context dependence.

2 Likes