std.mem.replaceOwned: help with escape sequences

Hobbyist, coding in Zig ver. 0.15.2, compiling in/for Windows 11. In this question I may be asking you to help me find a needle in a haystack (pun intended) – if it seems like a waste of your valuable time, please let me know and I’ll move on. As an exercise, I’m trying to write a CLI util for finding all instances of a byte sequence in a file and replacing them with a different byte sequence. I’m trying to accomplish this with std.mem.replaceOwned. To represent nonprintable bytes on the command line, I want to support the use of familiar escape sequences (\n for newline, etc.), including \0NNN for byte with octal value NNN, \dNNN for byte with decimal value NNN, and \xNN for byte with hex value NN. The code I’m posting below (change.zig) is a prototype using a tiny dataset (two lines of “ABCDE”) for illustration purposes.

My problem is this: If I command change.exe C x, ABCDE is changed to ABxDE, as expected. But if I write the same command with an escape code, for example change.exe \d67 x, ABCDE is not changed. This is so even though it appears that the escape code is correctly being converted to an uppercase C, via func escape2Bytes(), as shown by the printed display. On the other hand, if I put the escape sequence in the replace string, change.exe C \d120, once again the result is the expected ABxDE. The same func escape2Bytes() is used to convert the escape sequence to bytes. I expect and assume that there is a bug in escape2Bytes(), but if so it’s too subtle for me to detect. Any pointers as to what may be going wrong would be greatly appreciated. (I tried using std.mem.replace, but it made no difference.) Many thanks in advance. Again, I hope this is not a wild goose chase.

// change.zig
// Find and Replace Across (One) File
// CLD rev. 2026-01-04
// Zig ver. 0.15.0-dev.77+aa8aa6625

const std = @import("std");
const print = std.debug.print;
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
const zalloc = arena.allocator();

/// Entry point. Decodes the FIND and REPLACE command-line arguments with
/// `escape2Bytes`, applies the replacement to a built-in two-line sample
/// buffer, and prints the input, the decoded arguments and the result.
/// Shows the help text when fewer than two arguments are given or when the
/// first argument starts with "--" or "-h".
pub fn main() !void {
    defer arena.deinit();
    const args = try std.process.argsAlloc(zalloc);
    defer std.process.argsFree(zalloc, args);
    if (args.len < 3) {
        showHelp(args[0]);
        return;
    }
    if (args[1].len >= 2) {
        if (args[1][0] == '-' and (args[1][1] == '-' or args[1][1] == 'h')) {
            showHelp(args[0]);
            return;
        }
    }
    const data = [_]u8{ 'A', 'B', 'C', 'D', 'E', '\n', 'A', 'B', 'C', 'D', 'E', '\n' };
    const input = data[0..];
    print("In:\n{s}\n", .{input});
    const search_str: []u8 = try escape2Bytes(zalloc, args[1]);
    const replace_str: []u8 = try escape2Bytes(zalloc, args[2]);
    print("search : {s}\nreplace: {s}\n\n", .{ search_str, replace_str });

    // std.mem.replaceOwned asserts that the needle is non-empty (an empty
    // needle would never advance through the input), so reject it up front
    // instead of tripping the assertion.
    if (search_str.len == 0) {
        print("FIND_STR must not be empty\n", .{});
        return;
    }

    // pub fn replaceOwned(comptime T: type, allocator: Allocator,
    //   input: []const T, needle: []const T, replacement: []const T)
    //   Allocator.Error![]T
    const output = try std.mem.replaceOwned(u8, zalloc, input, search_str[0..], replace_str[0..]);
    print("Out:\n{s}", .{output});
}

/// Reports whether `ch` is a valid digit in `base`, ignoring letter case.
/// Bases above 16 are not supported and always yield false; base 0 has no
/// digits, so it yields false as well.
pub fn isBaseN(ch: u8, base: u8) bool {
    if (base > 16) return false;
    // charToDigit maps '0'-'9' and letters of either case to their numeric
    // value and rejects everything else. A plain '0'..'F' range comparison
    // would also accept the punctuation between '9' and 'A' (":;<=>?@").
    const value = std.fmt.charToDigit(ch, 16) catch return false;
    return value < base;
}

/// Converts backslash escape sequences in `arg_string` to raw bytes.
///
/// Allocates a buffer of `arg_string.len` bytes from `allocator`, writes the
/// decoded bytes to its start, fills every remaining position with 0 and
/// returns the whole buffer. The returned slice therefore always has length
/// `arg_string.len`, which can be longer than the number of decoded bytes.
///
/// Errors: allocation failure, or whatever `std.fmt.parseInt` returns for the
/// collected digits of a numeric escape.
pub fn escape2Bytes(allocator: anytype, arg_string: []u8) ![]u8 {
    // Convert escape sequences to bytes
    var bytes_out = try std.mem.Allocator.alloc(allocator, u8, arg_string.len);
    var i: usize = 0; // read index into arg_string
    var j: usize = 0; // write index into bytes_out
    var k: usize = 0; // write index into tmp (digits of a numeric escape)
    var base_in: u8 = 10;
    var max_len: u2 = 0;
    var testval: u16 = 0;
    var tmp: [3]u8 = undefined;
    var tmp2: []u8 = undefined;
    while (i < arg_string.len) {
        max_len = 0;
        // The final byte is always copied verbatim, even if it is a backslash.
        if (i == arg_string.len - 1) {
            bytes_out[j] = arg_string[i];
            i += 1;
            j += 1;
            break;
        }
        // 92 is the ASCII code of the backslash; anything else is copied as is.
        if (arg_string[i] != 92) {
            bytes_out[j] = arg_string[i];
            i += 1;
            j += 1;
            continue;
        }
        // Dispatch on the byte that follows the backslash.
        sw: switch (arg_string[i + 1]) {
            '\\' => {
                bytes_out[j] = 92;
                i += 2;
                j += 1;
            },
            'a' => {
                bytes_out[j] = 7;
                i += 2;
                j += 1;
            },
            'b' => {
                bytes_out[j] = 8;
                i += 2;
                j += 1;
            },
            't' => {
                bytes_out[j] = 9;
                i += 2;
                j += 1;
            },
            'n' => {
                bytes_out[j] = 10;
                i += 2;
                j += 1;
            },
            'v' => {
                bytes_out[j] = 11;
                i += 2;
                j += 1;
            },
            'f' => {
                bytes_out[j] = 12;
                i += 2;
                j += 1;
            },
            'r' => {
                bytes_out[j] = 13;
                i += 2;
                j += 1;
            },
            'e' => {
                bytes_out[j] = 27;
                i += 2;
                j += 1;
            },
            // The numeric escapes record their base and digit limit, then
            // jump to the shared 255 prong below.
            '0' => {
                base_in = 8;
                max_len = 3;
                continue :sw 255;
            },
            'd' => {
                base_in = 10;
                max_len = 3;
                continue :sw 255;
            },
            'x' => {
                base_in = 16;
                max_len = 2;
                continue :sw 255;
            },
            255 => {
                // max_len is still 0 only when this prong was entered directly,
                // i.e. the byte after the backslash really is 255: copy it.
                if (max_len < 1) {
                    bytes_out[j] = arg_string[i + 1];
                    i += 2;
                    j += 1;
                    continue;
                }
                i += 2;
                k = 0;
                // Collect consecutive digits of base_in into tmp. The loop is
                // bounded only by the input and isBaseN; max_len is not
                // consulted here, while tmp holds 3 bytes.
                while (i < arg_string.len and isBaseN(arg_string[i], base_in)) {
                    tmp[k] = arg_string[i];
                    i += 1;
                    k += 1;
                }
                tmp2 = tmp[0..k];
                testval = try std.fmt.parseInt(u16, tmp2, base_in);
                if (testval > 255) {
                    // Value does not fit in a byte: step back one digit.
                    // No byte is written to bytes_out in this branch.
                    i -= 1;
                    k -= 1;
                } else {
                    bytes_out[j] = @intCast(testval);
                    j += 1;
                }
            },
            // Any other byte after a backslash is emitted as itself.
            else => {
                bytes_out[j] = arg_string[i + 1];
                i += 2;
                j += 1;
            },
        }
    }
    // Zero the unused tail; the full-length buffer (not bytes_out[0..j]) is
    // what gets returned.
    for (j..bytes_out.len) |c| {
        bytes_out[c] = 0;
    }
    return bytes_out;
}

/// Prints the usage text and the list of recognized escape sequences,
/// substituting `progname` into the usage line.
pub fn showHelp(progname: [:0]u8) void {
    // One literal per output line; concatenation happens at compile time.
    const usage =
        "\nGlobal Find & Replace\n" ++
        "Usage: {s} [\"]FIND_STR[\"] [\"]REPLACE_STR[\"]\n" ++
        "\n" ++
        "The following escape sequences are recognized in FIND_STR and\n" ++
        "REPLACE_STR:\n" ++
        "\n" ++
        "   \\\\     backslash\n" ++
        "   \\\"     double quote\n" ++
        "   \\a     alert (BEL)\n" ++
        "   \\b     backspace\n" ++
        "   \\e     escape\n" ++
        "   \\f     form feed\n" ++
        "   \\n     newline\n" ++
        "   \\r     carriage return\n" ++
        "   \\t     horizontal tab\n" ++
        "   \\v     vertical tab\n" ++
        "   \\0NNN  byte with octal value NNN (1 to 3 digits)\n" ++
        "   \\dNNN  byte with decimal value NNN (1 to 3 digits)\n" ++
        "   \\xNN   byte with hexadecimal value NN (1 or 2 digits)\n";
    print(usage, .{progname});
}

This doesn’t help directly, in that this is unlikely to be related to the bug, but hopefully you’ll find this feedback useful.

Don’t code this way.

isBaseN('R', 32) isn’t false. It’s meaningless. Either handle base 32, whatever that means to you, or make the caller handle it: error{BaseTooLarge}!bool, or forbid it: @panic("base must be 16 or less");

You’ll also want to know about std.fmt.parseInt. It does a lot of what your code is doing, and deleting code is one way to fix bugs.

I have no idea what this code is doing, and you don’t either. Obviously tmp is .... while testval is because ... and i goes with ... but k is for ...., and let’s not forget tmp2.

base_in, max_len, those are good! I can guess what they do, so if I saw them doing something which wasn’t that, it might help me figure out what’s wrong. tmp2? not so much. i is probably your primary index, usually is, but you’re not working on a three-dimensional array (right??) so what are j and k about.

I bet if you go through and make your identifiers mean something, you’ll figure out why it isn’t doing what you want it to do.

This is the problem. Try changing this line:

print("search : {s}\nreplace: {s}\n\n", .{ search_str, replace_str });

to:

print("search : {f}\nreplace: {f}\n\n", .{ std.ascii.hexEscape(search_str, .lower), std.ascii.hexEscape(replace_str, .lower) });

and it should be clear what’s going wrong.

(note that even the change.exe C \d120 version that appears to be working is also affected by this bug)

Not sure what that algorithm is supposed to do, but this is how I would do it:

// change.zig
// Find and Replace Across (One) File
// CLD rev. 2026-01-04
// Zig ver. 0.15.0-dev.77+aa8aa6625

const std = @import("std");
const mem = std.mem;
const print = std.debug.print;
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);

const data = [_]u8{ 'A', 'B', 'C', 'D', 'E', '\n', 'A', 'B', 'C', 'D', 'E', '\n' };

/// Entry point. Expects exactly two arguments (search and replace patterns),
/// decodes their escape sequences, and applies the replacement to the
/// built-in sample `data`, printing each stage. Decoding errors are reported
/// through `handle` before being returned.
pub fn main() !void {
    const gpa = arena.allocator();
    defer arena.deinit();

    const args = try std.process.argsAlloc(gpa);
    defer std.process.argsFree(gpa, args);

    // "--" and "-h" are valid patterns so you can't use them the way you did
    if (args.len != 3) {
        showHelp(args[0]);
        return;
    }

    const input = data[0..];
    print("In:\n{s}\n", .{input});

    const search_str: []u8 = escape2Bytes(gpa, args[1]) catch |err| return handle("search", err);
    const replace_str: []u8 = escape2Bytes(gpa, args[2]) catch |err| return handle("replace", err);
    print("search : {s}\nreplace: {s}\n\n", .{ search_str, replace_str });

    // mem.replaceOwned asserts a non-empty needle (an empty one would never
    // advance through the input), so reject it here with a clear message.
    if (search_str.len == 0) {
        print("search pattern must not be empty\n", .{});
        return error.EmptySearchPattern;
    }

    const output = try mem.replaceOwned(u8, gpa, input, search_str[0..], replace_str[0..]);
    print("Out:\n{s}", .{output});
}

/// Prints a human-readable message for the escape-decoding errors it knows
/// about, naming the pattern (`where`) that was being decoded, and then
/// hands the same error back so the caller can return it. Errors it does
/// not recognize are passed through without a message.
fn handle(where: []const u8, err: anyerror) anyerror {
    if (err == error.WrongEscapeSequence) {
        print("{s} pattern contains unrecognized escape sequence\n", .{where});
    } else if (err == error.WrongBase10Escape) {
        print("{s} pattern contains invalid decimal escape sequence\n", .{where});
    } else if (err == error.WrongBase8Escape) {
        print("{s} pattern contains invalid octal escape sequence\n", .{where});
    } else if (err == error.WrongBase16Escape) {
        print("{s} pattern contains invalid hexadecimal escape sequence\n", .{where});
    } else if (err == error.InvalidCharacter or err == error.Overflow) {
        // Both come from std.fmt.parseInt on the digits of a numeric escape.
        print("{s} pattern contains invalid numerical escape sequence\n", .{where});
    }
    return err;
}

/// Measures a decimal escape. `buf` starts at the escape letter, so the
/// digits begin at index 1. Returns the index just past the last digit,
/// counting at most three digits; fails with `error.WrongBase10Escape` when
/// no digit follows the letter.
fn getDecimalLength(buf: []const u8) !usize {
    var end: usize = 1;
    while (end < buf.len and end < 4 and std.ascii.isDigit(buf[end])) : (end += 1) {}
    if (end == 1) return error.WrongBase10Escape;
    return end;
}

/// Measures an octal escape. `buf` starts at the escape character, so the
/// digits begin at index 1. Returns the index just past the last octal
/// digit, counting at most three digits; fails with `error.WrongBase8Escape`
/// when no octal digit follows.
fn getOctalLength(buf: []const u8) !usize {
    var end: usize = 1;
    while (end < buf.len and end < 4) : (end += 1) {
        const c = buf[end];
        if (c < '0' or c > '7') break;
    }
    if (end == 1) return error.WrongBase8Escape;
    return end;
}

/// Measures a hexadecimal escape. `buf` starts at the escape letter, so the
/// digits begin at index 1. Returns the index just past the last hex digit
/// (either case), counting at most two digits; fails with
/// `error.WrongBase16Escape` when no hex digit follows the letter.
fn getHexLength(buf: []const u8) !usize {
    var end: usize = 1;
    while (end < buf.len and end < 3 and std.ascii.isHex(buf[end])) : (end += 1) {}
    if (end == 1) return error.WrongBase16Escape;
    return end;
}

/// Decodes the backslash escape sequences in `arg` into raw bytes.
///
/// Supported escapes: `\\`, `\"`, `\a`, `\b`, `\t`, `\v`, `\e`, `\f`, `\n`,
/// `\r`, `\dNNN` (decimal), `\0NNN` (octal) and `\xNN` (hexadecimal).
///
/// Errors:
/// - `error.WrongEscapeSequence` for an unknown escape or a backslash that
///   ends the input with nothing after it;
/// - `error.WrongBase10Escape` / `WrongBase8Escape` / `WrongBase16Escape`
///   when a numeric escape has no digits;
/// - `error.Overflow` (from `std.fmt.parseInt`) when the value exceeds 255;
/// - `error.OutOfMemory`.
///
/// The caller owns the returned slice.
fn escape2Bytes(gpa: mem.Allocator, arg: []u8) ![]u8 {
    var out = try std.ArrayList(u8).initCapacity(gpa, arg.len);
    // Release the partial output if decoding fails part-way through.
    errdefer out.deinit(gpa);

    var escape = false;
    var i: usize = 0;
    while (i < arg.len) : (i += 1) {
        const ch = arg[i];

        if (!escape) {
            if (ch == '\\') {
                escape = true;
            } else {
                try out.append(gpa, ch);
            }
            continue;
        }

        // handling escape sequences
        escape = false;

        switch (ch) {
            '\\', '"' => try out.append(gpa, ch),
            'a' => try out.append(gpa, 0x07),
            'b' => try out.append(gpa, 0x08),
            't' => try out.append(gpa, 0x09),
            'v' => try out.append(gpa, 0x0b),
            'e' => try out.append(gpa, 0x1b),
            'f' => try out.append(gpa, 0x0c),
            'n' => try out.append(gpa, 0x0a),
            'r' => try out.append(gpa, 0x0d),

            // For the numeric escapes, `j` is the offset just past the last
            // digit relative to `i`. parseInt into a u8 already rejects
            // values above 255 with error.Overflow.
            'd' => {
                const j = try getDecimalLength(arg[i..]);
                const n = try std.fmt.parseInt(u8, arg[i + 1 .. i + j], 10);
                try out.append(gpa, n);
                i += j - 1;
            },

            '0' => {
                const j = try getOctalLength(arg[i..]);
                const n = try std.fmt.parseInt(u8, arg[i + 1 .. i + j], 8);
                try out.append(gpa, n);
                i += j - 1;
            },

            'x' => {
                const j = try getHexLength(arg[i..]);
                const n = try std.fmt.parseInt(u8, arg[i + 1 .. i + j], 16);
                try out.append(gpa, n);
                i += j - 1;
            },

            else => return error.WrongEscapeSequence,
        }
    }

    // A backslash as the very last byte starts an escape that never finishes.
    if (escape) return error.WrongEscapeSequence;

    return try out.toOwnedSlice(gpa);
}

/// Prints the `help` text, substituting `progname` into its usage line.
pub fn showHelp(progname: [:0]u8) void {
    const fmt_args = .{progname};
    print(help, fmt_args);
}

/// Usage text printed by `showHelp`; `{s}` is replaced by the program name.
/// Backslashes are literal inside a multiline string, so the optional quote
/// in the usage line is written as plain `["]`, while the table rows keep
/// their backslashes because they show the escape sequences themselves.
/// The final empty line gives the text a trailing newline.
const help =
    \\Global Find & Replace
    \\Usage: {s} ["]FIND_STR["] ["]REPLACE_STR["]
    \\
    \\The following escape sequences are recognized in FIND_STR and
    \\REPLACE_STR:
    \\
    \\   \\     backslash
    \\   \"     double quote
    \\   \a     alert (BEL)
    \\   \b     backspace
    \\   \e     escape
    \\   \f     form feed
    \\   \n     newline
    \\   \r     carriage return
    \\   \t     horizontal tab
    \\   \v     vertical tab
    \\   \0NNN  byte with octal value NNN (1 to 3 digits)
    \\   \dNNN  byte with decimal value NNN (1 to 3 digits)
    \\   \xNN   byte with hexadecimal value NN (1 or 2 digits)
    \\
;

Many thanks for the replies. Special thanks to @squeek502 for fingering the bug. In short, func escape2Bytes was outputting extraneous bytes – fixed.

@mnemnion: I appreciate the feedback and have taken in your comments. Fixed func IsBaseN; annotated what i, j, and k are doing; renamed tmp to something more descriptive; and got rid of tmp2. One of the pitfalls of programming mainly for oneself is the tendency to lapse into solipsistic shorthand.

@mg979: Thanks for taking the time to comment. I will need to study your code. At first glance, though, I like the explicit error-handling. I’ve refactored func showHelp along the lines you suggest.

Here is the revised code:

// Find and Replace Across (One) File
// CLD rev. 2026-01-05
// Zig ver. 0.15.2

const std = @import("std");
const print = std.debug.print;
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
const zalloc = arena.allocator();

/// Entry point. Decodes the FIND and REPLACE command-line arguments with
/// `escape2Bytes`, applies the replacement to a built-in two-line sample
/// buffer, and prints the input, the decoded arguments (hex-escaped so that
/// non-printable bytes are visible) and the result. Shows the help text when
/// fewer than two arguments are given.
pub fn main() !void {
    defer arena.deinit();
    const args = try std.process.argsAlloc(zalloc);
    defer std.process.argsFree(zalloc, args);
    if (args.len < 3) {
        showHelp(args[0]);
        return;
    }
    const data = [_]u8{ 'A', 'B', 'C', 'D', 'E', '\n', 'A', 'B', 'C', 'D', 'E', '\n' };
    const input = data[0..];
    print("In:\n{s}\n", .{input});
    const search_str: []u8 = try escape2Bytes(zalloc, args[1]);
    const replace_str: []u8 = try escape2Bytes(zalloc, args[2]);
    print("search : {f}\nreplace: {f}\n\n", .{ std.ascii.hexEscape(search_str, .lower), std.ascii.hexEscape(replace_str, .lower) });

    // std.mem.replaceOwned asserts that the needle is non-empty (an empty
    // needle would never advance through the input), so reject it up front
    // instead of tripping the assertion.
    if (search_str.len == 0) {
        print("FIND_STR must not be empty\n", .{});
        return;
    }

    // pub fn replaceOwned(comptime T: type, allocator: Allocator,
    //   input: []const T, needle: []const T, replacement: []const T)
    //   Allocator.Error![]T
    const output = try std.mem.replaceOwned(u8, zalloc, input, search_str[0..], replace_str[0..]);
    print("Out:\n{s}", .{output});
}

/// Reports whether `ch` is a valid digit in `base`, ignoring letter case.
/// Panics when `base` is greater than 16. Base 0 has no digits, so every
/// character yields false for it.
pub fn isBaseN(ch: u8, base: u8) bool {
    if (base > 16) @panic("base must be 16 or less");
    // charToDigit maps '0'-'9' and letters of either case to their numeric
    // value and rejects everything else. A plain '0'..'F' range comparison
    // would also accept the punctuation between '9' and 'A' (":;<=>?@").
    const value = std.fmt.charToDigit(ch, 16) catch return false;
    return value < base;
}

/// Decodes backslash escape sequences in `arg_string` into raw bytes.
///
/// Recognized escapes: `\\`, `\a`, `\b`, `\t`, `\n`, `\v`, `\f`, `\r`, `\e`,
/// `\oNNN` (octal, 1-3 digits), `\dNNN` (decimal, 1-3 digits) and `\xNN`
/// (hexadecimal, 1-2 digits). Any other character after a backslash is
/// emitted as itself (so `\"` yields `"`), and a backslash that is the very
/// last byte of the input is copied through unchanged.
///
/// A numeric escape consumes the longest run of digits that stays within the
/// digit limit and still fits in one byte; any further digits are ordinary
/// text (`\d300` decodes to byte 30 followed by the character '0').
///
/// Returns `error.InvalidCharacter` when a numeric escape has no digits, and
/// `error.OutOfMemory` on allocation failure. The caller owns the returned
/// slice, whose length is exactly the number of decoded bytes.
pub fn escape2Bytes(allocator: std.mem.Allocator, arg_string: []const u8) ![]u8 {
    // No escape sequence is shorter than the byte it produces, so the input
    // length is an upper bound for the output length.
    const bytes_out = try allocator.alloc(u8, arg_string.len);
    errdefer allocator.free(bytes_out);

    var i: usize = 0; // read index into arg_string
    var j: usize = 0; // write index into bytes_out
    while (i < arg_string.len) {
        // Ordinary byte, or a backslash with nothing after it: copy through.
        if (arg_string[i] != '\\' or i + 1 == arg_string.len) {
            bytes_out[j] = arg_string[i];
            i += 1;
            j += 1;
            continue;
        }

        const selector = arg_string[i + 1];
        i += 2;

        var base: u8 = 0;
        var max_len: usize = 0;
        switch (selector) {
            'o' => {
                base = 8;
                max_len = 3;
            },
            'd' => {
                base = 10;
                max_len = 3;
            },
            'x' => {
                base = 16;
                max_len = 2;
            },
            else => {
                // Single-character escapes; unknown ones stand for themselves.
                bytes_out[j] = switch (selector) {
                    'a' => 7,
                    'b' => 8,
                    't' => 9,
                    'n' => 10,
                    'v' => 11,
                    'f' => 12,
                    'r' => 13,
                    'e' => 27,
                    else => selector,
                };
                j += 1;
                continue;
            },
        }

        // Accumulate digits while they are valid for the base, within the
        // digit limit, and the running value still fits in a byte.
        var value: u16 = 0;
        var digit_count: usize = 0;
        while (digit_count < max_len and i < arg_string.len) {
            const digit = std.fmt.charToDigit(arg_string[i], base) catch break;
            const next: u16 = value * base + digit;
            if (next > 255) break;
            value = next;
            i += 1;
            digit_count += 1;
        }
        if (digit_count == 0) return error.InvalidCharacter;

        bytes_out[j] = @intCast(value);
        j += 1;
    }

    // Shrink the allocation to the decoded length so the caller can free the
    // result with the same allocator.
    return allocator.realloc(bytes_out, j);
}

/// Prints the `help` text, substituting `progname` into its usage line.
pub fn showHelp(progname: [:0]u8) void {
    const fmt_args = .{progname};
    print(help, fmt_args);
}

/// Usage text printed by `showHelp`; `{s}` is replaced by the program name.
/// Backslashes are literal inside a multiline string, so the optional quote
/// in the usage line is written as plain `["]`, while the table rows keep
/// their backslashes because they show the escape sequences themselves.
const help =
    \\Global Find & Replace
    \\Usage: {s} ["]FIND_STR["] ["]REPLACE_STR["]
    \\
    \\The following escape sequences are recognized in FIND_STR and
    \\REPLACE_STR:
    \\
    \\   \\     backslash
    \\   \"     double quote
    \\   \a     alert (BEL)
    \\   \b     backspace
    \\   \e     escape
    \\   \f     form feed
    \\   \n     newline
    \\   \r     carriage return
    \\   \t     horizontal tab
    \\   \v     vertical tab
    \\   \dNNN  byte with decimal value NNN (1 to 3 digits)
    \\   \oNNN  byte with octal value NNN (1 to 3 digits)
    \\   \xNN   byte with hexadecimal value NN (1 or 2 digits)
    \\
;

1 Like

That’s multiples more comprehensible, kudos.

If I might make one additional suggestion:

You know, I don’t hate this. 0xff won’t show up in utf8, you’re pressing it into service as a labeled goto in a trenchcoat, fine.

But just to spare yourself some agony later, maybe:

const PARSE_DIGITS = 0xff; // Sentinel value, not legal UTF-8

// Then
    continue :sw PARSE_DIGITS;

// etc
    PARSE_DIGITS => { 
        ...
    },
3 Likes

Lovely, thank you.

2 Likes