How could I improve my parsing?

PasVegan · February 16, 2024, 3:53pm

Hello,

I’ve recently completed a mathematics project in Zig, aimed at calculating probabilities in the game of Yams. Now that it’s fully operational and yielding the expected results, I’m looking to refine and improve it. This project marks my inaugural journey with Zig, and I’m particularly interested in enhancing the parsing aspect.

The program currently parses command line arguments to appropriately supply values to the computation functions. I’m seeking advice on how to make this parsing process more efficient or elegant. Any insights or suggestions from the community would be greatly appreciated!

Thank you in advance for your time and help.

here are some usage example:

./201yams 0 0 0 0 0 yams_4
Chances to get a 4 yams: 0.01%

./201yams 1 2 3 4 5 four_4
Chances to get a 4 four-of-a-kind: 1.62%

./201yams 2 2 5 4 6 straight_6
Chances to get a 6 straight: 16.67%

./201yams 0 0 0 0 0 full_2_3
Chances to get a 2 full of 3: 0.13%

./201yams 2 3 2 3 2 full_2_3
Chances to get a 2 full of 3: 100.00%

here is the code (yes it’s long)

var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator();
    defer _ = gpa.deinit();

    const args = std.process.argsAlloc(allocator) catch {
        std.debug.print("failed to allocate args\n", .{});
        return 84;
    };
    defer std.process.argsFree(allocator, args);

    if (args.len != 2 and args.len != 7) {
        std.debug.print("wrong number of arguments\n", .{});
        return 84;
    }

    if (std.mem.eql(u8, args[1], "-h")) {
        const help = "USAGE\n\t./201yams d1 d2 d3 d4 d5 c\n\nDESCRIPTION\n\td1\tvalue of the first die (0 if not thrown)\n\td2\tvalue of the second die (0 if not thrown)\n\td3\tvalue of the third die (0 if not thrown)\n\td4\tvalue of the fourth die (0 if not thrown)\n\td5\tvalue of the fifth die (0 if not thrown)\n\tc\tcombination to be obtained\n";
        std.io.getStdOut().writeAll(help) catch {
            std.debug.print("failed to write to stdout\n", .{});
            return 84;
        };
        return 0;
    }

    var dice_numbers: [5]u8 = undefined;
    var dice_count: u8 = 0;

    var combination: combinations = undefined;
    var combination_nb: [2]u8 = undefined;

    for (args[1..]) |arg| {
        if (dice_count < 5) {
            if (arg.len != 1) {
                std.debug.print("wrong dice number, too long\n", .{});
                return 84;
            }
            dice_numbers[dice_count] = std.fmt.charToDigit(arg[0], 7) catch {
                std.debug.print("wrong dice number, not a digit or digit > 6\n", .{});
                return 84;
            };
            dice_count += 1;
        } else {
            if (std.mem.startsWith(u8, arg, "pair_")) {
                if (arg.len != 6) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.pair;
                combination_nb[0] = std.fmt.charToDigit(arg[5], 7) catch {
                    std.debug.print("wrong pair number, not a digit or digit > 6\n", .{});
                    return 84;
                };
            } else if (std.mem.startsWith(u8, arg, "three_")) {
                if (arg.len != 7) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.three;
                combination_nb[0] = std.fmt.charToDigit(arg[6], 7) catch {
                    std.debug.print("wrong three number, not a digit or digit > 6\n", .{});
                    return 84;
                };
            } else if (std.mem.startsWith(u8, arg, "four_")) {
                if (arg.len != 6) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.four;
                combination_nb[0] = std.fmt.charToDigit(arg[5], 7) catch {
                    std.debug.print("wrong four number, not a digit or digit > 6\n", .{});
                    return 84;
                };
            } else if (std.mem.startsWith(u8, arg, "full_")) {
                if (arg.len != 8) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.full;
                combination_nb[0] = std.fmt.charToDigit(arg[5], 7) catch {
                    std.debug.print("wrong full first number, not a digit or digit > 6\n", .{});
                    return 84;
                };
                combination_nb[1] = std.fmt.charToDigit(arg[7], 7) catch {
                    std.debug.print("wrong full second number, not a digit or digit > 6\n", .{});
                    return 84;
                };
                if (arg[6] != '_') {
                    std.debug.print("invalid full syntax\n", .{});
                    return 84;
                }
            } else if (std.mem.startsWith(u8, arg, "straight_")) {
                if (arg.len != 10) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.straight;
                combination_nb[0] = std.fmt.charToDigit(arg[9], 7) catch {
                    std.debug.print("wrong straight number, not a digit or digit > 6\n", .{});
                    return 84;
                };
                if (combination_nb[0] != 5 and combination_nb[0] != 6) {
                    std.debug.print("wrong straight number, it can only be 5 or 6\n", .{});
                    return 84;
                }
            } else if (std.mem.startsWith(u8, arg, "yams_")) {
                if (arg.len != 6) {
                    std.debug.print("invalid combination\n", .{});
                    return 84;
                }
                combination = combinations.yams;
                combination_nb[0] = std.fmt.charToDigit(arg[5], 7) catch {
                    std.debug.print("wrong yams number, not a digit or digit > 6\n", .{});
                    return 84;
                };
            } else {
                std.debug.print("wrong combination\n", .{});
                return 84;
            }
        }
    }

    if (combination_nb[0] == 0 or combination_nb[1] == 0) {
        std.debug.print("combination number cannot be 0\n", .{});
        return 84;
    }

LucasSantos91 · February 16, 2024, 5:01pm

I’m assuming 84 is an error code. You should at least give it a name. It looks like the result is either 0 or 84, so you could return a bool, an enum or an `Error!void". All three options should generate the same machine code, they would just have different ergonomic properties.
About the parsing, startsWith isn’t great for choosing between multiple options. The problem is that each time it is called, it forgets all the infornation it already has. For instance, the first test checks if the string is “pairs”, suppose it sees that the first letter is an “f”. Testing for “three” is a waste, you already know it can’t be that.
When choosing amongst multiple options, usually the most efficient implementation is a tree. Something like this:

return switch(string[0]){
  'p' => .pair,
  'f' => switch (string[1]){
    'o' => .four,
    'u' => .full,
    else => error.parseError,
  },
// So on...
}

You get the gist. At each step in the tree, you preserve the information you already had. Once you’ve seen an “f”, this has to be either a .four, a .full, or an error.
This implementation assumes no typos. If the user types “pour”, it will bind to “.pair”. This may or may not be desirable. You can introduce more or less error checking, depending on your needs.

AndrewCodeDev · February 16, 2024, 6:32pm

You may be able to combine the two…

So for instance, if we wanted to be strict about spelling errors, you can use startsWith after you’ve found your potential branch.

return switch(string[0]){
  'p' => {
       return if (std.mem.startsWith(u8, string[1..], "air")) 
           .pair else error.parseError;
   },
   // So on...
}

At least then you can call startsWith much less often. And if you only have 1 u8, string[1..] is just an empty slice.

Also, welcome to the forum @PasVegan

PasVegan · February 16, 2024, 7:33pm

Thanks both of you ,
I understand what you said about checking the combination. But what would be the best way to parse the numbers ?
Do I just make another switch case after the parsing that check each combination and get the digits I want by hard coding their index ?

AndrewCodeDev · February 16, 2024, 10:22pm

You may be able to directly switch on your numeric cases… kinda like so…

// some case statement branch we're in
// => {
    switch (number) {
       1 => // do something
       // and so on...
    }
//}

That way, you could try to handle the branches… for instances I see an example here:

You could probably transform that into something like…

// some case statement branch we're in
// => {
    switch (combination_nb[0]) {
       5, 6 => // good path
       else => // error
    }
//}

I’d have to see the example reworked because a lot could change when you refactor those outer switch branches. That one case is not super convincing, but without knowing more about where you’re going with this, I’m willing to wager you could condense some of those checks.

That’s especially true if you have multiple combinations that could be valid/invalid under a single parent branch where the child branch could you to something else.

Honestly, if it were me… I’d start off by check all the digits in your string first so you can remove some of the redundancy.

for (string) |char| {
    if (std.ascii.isDigit(char)) {
        // if the digit is too big, handle it once here...
    }
}

That way you can reduce some of these conglomerated checks:

So you could then get rid of the "digit > 6" checks

I typically find that validating my data first makes my life easier. If we were talking about massive strings, then that may be insufferable. In this case, I doubt you’d be able to measure a performance hit on small strings.

I’m a little confused about these errors as they are - I think it would be good to separate your concerns out… here’s why…

Let’s zoom in on one branch of your code here:

combination_nb[0] = std.fmt.charToDigit(arg[5], 7) catch {
    std.debug.print("wrong four number, not a digit or digit > 6\n", .{});
    return 84;
};

So, ostensibly, your error is saying "wrong four number, not a digit or digit > 6". We’re getting there via a catch… but what are we catching?

pub fn charToDigit(c: u8, base: u8) (error{InvalidCharacter}!u8) {
    const value = switch (c) {
        '0'...'9' => c - '0',
        'A'...'Z' => c - 'A' + 10,
        'a'...'z' => c - 'a' + 10,
        else => return error.InvalidCharacter,
    };

    if (value >= base) return error.InvalidCharacter;

    return value;
}

The only thing that this communicates is that we either weren’t in [a-z, A-Z, 0-9] or the parsed number was greater than the base. I don’t see how that is related to it being a "wrong four number"… see what I’m saying? The catch there only communicates (in my mind) the last 2/3rds of the error message. Maybe I’m missing something here - happens from time to time, lol.

Anyhow, those are some things to consider

dude_the_builder · February 17, 2024, 9:51pm

Late to the party here but I wanted to provide a full example that covers a few recommended strategies that can be applied here. Explaining this in prose would be monumental so I’ll let the code and comments do the talking and if you have any doubts don’t hesitate in asking here. I just did the sample for the full and pair combinations but extending to the rest is pretty much the same idea.

const std = @import("std");

// Type names are PascalCase and usually not plural.
// A tagged union is perfect for grouping variants of
// a common type, in this case the combinations.
const Combination = union(enum) {
    full: Full,
    pair: Pair,

    // This function dispatches to the variants.
    fn parse(args: []const []const u8) !Combination {
        if (combinations.get(args[0])) |v| {
            // We switch on the tagged union.
            return switch (v) {
                .full => .{ .full = try Full.init(args[1..]) },
                .pair => .{ .pair = try Pair.init(args[1..]) },
            };
        }

        return error.InvalidCombination;
    }
};

const Full = struct {
    // Default values so we can easily
    // instantiate in the comptime map.
    n1: u8 = 0,
    n2: u8 = 0,

    // The init function is specific to each variant.
    fn init(args: []const []const u8) !Full {
        if (args.len < 2) return error.MissingFullArg;

        return .{
            .n1 = try parseDice(args[0]),
            .n2 = try parseDice(args[1]),
        };
    }
};

const Pair = struct {
    n: u8 = 0,

    fn init(args: []const []const u8) !Pair {
        if (args.len < 1) return error.MissingPairArg;
        return .{ .n = try parseDice(args[0]) };
    }
};

// Like a hash map, but where all keys and values have
// to be known at compile time. Great for matching strings.
// Here the values are just placeholders so we can switch
// on their tagged union variant type.
const combinations = std.ComptimeStringMap(Combination, .{
    .{ "full", .{ .full = .{} } },
    .{ "pair", .{ .pair = .{} } },
});

// Use a function to reduce code duplication and
// provide convenient error handling scope.
fn parseDice(arg: []const u8) !u8 {
    const n = try std.fmt.parseInt(u8, arg, 10);
    if (n > 6) return error.InvalidDice;
    return n;
}

// Main usually returns an error or void.
pub fn main() !void {
    // Keep the deinit right after the init.
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Use try unless you really have to handle an error.
    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);

    // You probably want the usage on wrong number of args too.
    if ((args.len != 2 and args.len < 8) or std.mem.eql(u8, args[1], "-h")) {
        // We have multiline strings.
        const help =
            \\USAGE
            \\    ./201yams d1 d2 d3 d4 d5 c cd1 [cd2]
            \\
            \\DESCRIPTION
            \\    d1    value of the first die (0 if not thrown)
            \\    d2    value of the second die (0 if not thrown)
            \\    d3    value of the third die (0 if not thrown)
            \\    d4    value of the fourth die (0 if not thrown)
            \\    d5    value of the fifth die (0 if not thrown)
            \\    c     combination to be obtained
            \\    cd1   first dice in combination
            \\    [cd2] second dice in combination (if applicable)
            \\
        ;

        // Use try unless you really have to handle an error.
        try std.io.getStdOut().writeAll(help);

        return;
    }

    var dice_numbers: [5]u8 = undefined;

    // Factoring out function makes this tidy.
    for (args[1..6], 0..) |arg, i| dice_numbers[i] = try parseDice(arg);

    const combination = try Combination.parse(args[6..]);

    std.debug.print("Combination is {s}\n", .{@tagName(combination)});
}

PasVegan · February 19, 2024, 9:42am

Thank you for taking time to answer me. Your response really feel like the zig way to do things, I like it !

PasVegan · February 20, 2024, 1:46pm

I refactored everything, here the link to the full code: https://github.com/PasVegan/yams-calculator

If you have the time to check it and give me feedback I would really appreciate it .

dude_the_builder · February 21, 2024, 12:08am

I’d say the code is much better, nice job! I believe you have to modify the usage message because it doesn’t match the actual way to specify combinations you’re using in the code.