Alternatives to Functional Parsing

I appreciate that Zig makes functional programming possible, but awkward. It makes you think twice about what you’re doing.

During Advent of Code, I wrote a small parsing library using a functional parser, because that’s the only way I know how to, but I’m wondering if there’s a better way in Zig that doesn’t involve writing each parser as its own, non-reusable function.

i.e. Is this code terrible?

In a perfect world, the “sub-parsers” would get inlined at comptime, so you’d just have one output function doing all the heavy lifting, but I image that’s not entirely possible.

Here is one of the completed parsers:

/// Parse `mul(x, y)`, where x and y are unsigned integers
// zig fmt: off
const parse_mul: parse.Parser(struct { []u8, []u8 }) = parse.preceded(
    []const u8,
    struct { []u8, []u8 },
    parse.tag("mul("),
    parse.terminated(
        struct { []u8, []u8 },
        void,
        parse.delimited(
            []u8,
            void,
            []u8,
            parse.take_while_digit,
            parse.skip(','),
            parse.take_while_digit,
        ),
        parse.skip(')')
    )
);

fn mul(a: []u8, b: []u8) !u32 {
    const a_val = try std.fmt.parseInt(u32, a, 10);
    const b_val = try std.fmt.parseInt(u32, b, 10);

    return a_val * b_val;
}

And the library it uses:

// parse.zig

const std = @import("std");

pub const ParseError = error{
    /// The parser could succeed given more input, but ran out.
    Incomplete,
    /// An unrecoverable failure has occurred.
    Panic,
};

pub fn Parser(comptime T: type) type {
    return fn ([]u8) ParseError!struct { T, []u8 };
}

/// Match a string of one or more characters.
pub fn tag(comptime pattern: []const u8) Parser([]const u8) {
    const s = struct {
        fn invoke(rest: []u8) ParseError!struct { []const u8, []u8 } {
            if (rest.len < pattern.len) {
                return ParseError.Incomplete;
            } else {
                for (pattern, 0..) |char, j| {
                    if (rest[j] != char) {
                        return ParseError.Panic;
                    }
                } else {
                    return .{ pattern, rest[pattern.len..] };
                }
            }
        }
    };

    return s.invoke;
}

/// Look for the given character, and then discard it.
pub fn skip(comptime char: u8) Parser(void) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { void, []u8 } {
            if (rest.len < 1) {
                return ParseError.Incomplete;
            } else {
                if (rest[0] == char) {
                    return .{ {}, rest[1..] };
                } else {
                    return ParseError.Panic;
                }
            }
        }
    };

    return s.invoke;
}

/// Parse a u32 from a string, returns as i32 for ease of use
pub fn int(
    rest: []u8,
) !struct { i32, []u8 } {
    const results = try take_while_digit(rest);

    return .{ try std.fmt.parseInt(i32, results[0], 10), results[1] };
}

/// Parse any number of consecutive (base 10) digits
pub fn take_while_digit(rest: []u8) ParseError!struct { []u8, []u8 } {
    if (rest.len > 1) {
        var index: u32 = 0;
        while (index < rest.len and is_digit(rest[index])) {
            index += 1;
        }

        if (index == 0) {
            return ParseError.Panic;
        } else {
            return .{ rest[0..index], rest[index..] };
        }
    } else {
        if (rest.len == 0) {
            return ParseError.Incomplete;
        } else {
            if (is_digit(rest[0])) {
                return .{ rest[0..1], &[_]u8{} };
            } else {
                return .{ &[_]u8{}, rest };
            }
        }
    }
}

test "take_while_digit" {
    const input = "837.152";
    const result = take_while_digit(@constCast(input));
    try std.testing.expectEqualSlices(u8, "837", result[0]);
    try std.testing.expectEqualSlices(u8, ".152", result[1]);
}

pub fn is_digit(char: u64) bool {
    return char > '/' and char < ':';
}

test "is_digit" {
    try std.testing.expectEqual('0', 48);
    try std.testing.expectEqual('9', 57);

    try std.testing.expect(is_digit('0'));
    try std.testing.expect(is_digit('1'));
    try std.testing.expect(is_digit('2'));
    try std.testing.expect(is_digit('3'));
    try std.testing.expect(is_digit('4'));
    try std.testing.expect(is_digit('5'));
    try std.testing.expect(is_digit('6'));
    try std.testing.expect(is_digit('7'));
    try std.testing.expect(is_digit('8'));
    try std.testing.expect(is_digit('9'));

    try std.testing.expect(!is_digit('a'));
}

// Discard all output until a digit is found.
fn skip_until_digit(rest: []u8) ParseError!struct { void, []u8 } { // u64 {
    var index: u32 = 0;
    while (index < rest.len) : (index += 1) {
        if (is_digit(rest[index])) {
            if (index > 0) {
                return .{ {}, rest[index..] };
            } else {
                return .Incomplete;
            }
        }
    }

    return .{ {}, &[_]u8{} };
}

/// Discard the results of the first parser, then return the results of the
/// second parser.
pub fn preceded(
    comptime T: type,
    comptime U: type,
    drop: Parser(T),
    keep: Parser(U),
) Parser(U) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { U, []u8 } {
            _, const remaining = try drop(rest);

            return keep(remaining);
        }
    };

    return s.invoke;
}

/// Return the results of the first parser, then discard the results of the
/// second parser.
pub fn terminated(
    comptime T: type,
    comptime U: type,
    keep: Parser(T),
    drop: Parser(U),
) Parser(T) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { T, []u8 } {
            const result, var remaining = try keep(rest);
            _, remaining = try drop(remaining);

            return .{ result, remaining };
        }
    };

    return s.invoke;
}

/// Return the results of left and right, discarding the middle.
pub fn delimited(
    comptime T: type,
    comptime U: type,
    comptime V: type,
    left: Parser(T),
    middle: Parser(U),
    right: Parser(V),
) Parser(struct { T, V }) {
    const s = struct {
        pub fn invoke(rest: []u8) ParseError!struct { struct { T, V }, []u8 } {
            var remaining: []u8 = rest;

            const left_out, remaining = try left(remaining);
            _, remaining = try middle(remaining);
            const right_out, remaining = try right(remaining);

            return .{ .{ left_out, right_out }, remaining };
        }
    };

    return s.invoke;
}

The only nitpick I have with your code is the amount of non const slices, as it forces at least one allocation at the beginning. Not an issue, since you’d do that anyway when getting the parser input externally.

I think any non-trivial parser will have reusable sub parsers, so I’m not sure what you’re asking.

The parsers will probably be inlined, at least the simple ones will be, this is because you are using the fn ... types which are comptime known, a runtime known function would be *const fn ....
Though you should definitely check the assembly and force inline trivial parses if they aren’t already.

I prefer using objdump in the terminal though as it doesn’t require exporting functions which is a real pain since you have to make them compatible with the c abi
just need to export a wrapper function that uses what you want to inspect, otherwise zig will remove it as its unused.

4 Likes

In my experience, the simplest & most maintainable way to write a parser is to just directly write the parser’s code by hand. I have two relevant tutorials (in Rust, but they use zero language features):

I wouldn’t go as far as saying that hand-writing the parser is always a solution, but I do highly recommend learning parsing “by hand” before learning about parser combinators or parser generators, to have a more complete picture of available options. The education tends to start with Yacc and such, and I think this is pedagogically confusing (I was certainly confused about parsing a lot until relatively late in my career).

6 Likes

:eyes: Wow, an actual code review- Thank You!

I’ve done very little work with low-level languages before, so I didn’t even know that was an option :slight_smile:

I figured functional programming and function composition are inherently inefficient to run, hence why Zig “doesn’t have closures”, and makes it so awkward for you to do FP. When I tried to think about how to lower the function count, the only thing I could think to do was manually inline all the functions myself, as I have no idea what the compiler can and can’t inline. But it sounds like maybe I don’t need to be as worried about it as I am.

Thank you for the detailed response! I never set out to learn parsers specifically, I just heard about parser combinators while I was tinkering with Haskell, and they piqued my interest.

Thanks for showing me a more object-oriented approach. I personally find the functional style more intuitive, but the next time I have to write a parser I will definitely come back to your articles so I can experience both methods.

It’s definitely not object-oriented! Just old-school imperative.

1 Like

My apologies, I realize that can be a loaded term, and no offense was intended!

In my mind, (good) OOP is about methods and private properties, not inheritance. But that’s a whole 'nother discussion :sweat_smile:

I personally just use simple, single-pass, top down recursive parsers, since they are easy to write and easy to understand, it’s all just plain and simple procedural code, no magic abstractions.

In your case it would be something like this pseudo code:

fn parseMul(stream) {
    const name = parseIdentifier(stream);
    if(name != "mul") return error...;
    stream.skipWhitespaces();

    if(stream.next() != '(') return error...;
    stream.skipWhitespaces();

    const first = parseInt(stream);
    stream.skipWhitespaces();

    if(stream.next() != ',') return error...;
    stream.skipWhitespaces();

    const second = parseInt(stream);
    if(stream.next() != ')') return error...;

    return .{first, second};
}

fn parseIdentifier(stream) {
    const buf = stream.remaining;
    var len = buf.len;
    for(buf, 0..) |c, i| {
        if(!isAlphanum(c) && c != '_') {
            len = i;
            break;
        }
    }
    stream.advance(len);
    return buf[0..len];
}

fn parseInt(stream) {
    var value: usize = 0;
    while(true) {
        if(!isDigit(stream.peek())) {
            len = i;
            break;
        }
        value = value*10 + (stream.next() - '0');
    }
    return value;
}

The only really annoying thing with this approach is that you have to skip the whitespaces everywhere. But that could be easily solved by just using a tokenizer first.

2 Likes

I agree with this, and just want to add a caveat: things are different if one is also inventing the grammar which is being parsed. In that case I suggest supplementing with formal methods.

It’s very easy to write a parser which ad-hoc describes a language with various problems, ones which only show up when someone else tries to parse it with new code, or worse, when an adversary studies the parser implementation, finds a weird machine in it, and crafts an input which can lead to privilege escalation.

The way to mitigate that is to evolve the grammar using a parsing tool which bases the code on that grammar.

If the format already exists, then hand-rolling has a lot to offer in the way of control. Someone else has already confirmed that the format has the necessary properties, so the extra hassle and abstraction isn’t necessarily worth it. Even so I suggest fuzzing it intensely, parsers are a great way to let malicious actors get inside the rest of the program.

1 Like

Would you mind elaborating on this? By “formal methods”, do you mean defining a grammer using ABNF or something else entirely?