As @gonzo pointed out, the conditional is not quite right. Using std.ascii.isAlphabetic
fixes it and is equally performant, though:
const std = @import("std");

pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!usize {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    if (!std.ascii.isAlphabetic(alpha2_code[0]) or !std.ascii.isAlphabetic(alpha2_code[1])) return error.InvalidAlpha2Code;
    const a = (alpha2_code[0] | 32) - 'a';
    const b = (alpha2_code[1] | 32) - 'a';
    return @as(u16, a) * 26 + b;
}
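For a quick sanity check, here is a minimal test sketch (the expected values follow directly from the arithmetic above: ids range from 0 for "aa"/"AA" up to 675 for "zz"/"ZZ", matching a 676-slot table):

test "alpha2_to_lookup_id maps letter pairs into 0..675" {
    try std.testing.expectEqual(@as(usize, 0), try alpha2_to_lookup_id("aa"));
    try std.testing.expectEqual(@as(usize, 538), try alpha2_to_lookup_id("US")); // 20 * 26 + 18
    try std.testing.expectEqual(@as(usize, 675), try alpha2_to_lookup_id("ZZ"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("u5"));
}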
I’d try:
EDIT: I thought about this a little more and I have some more ideas I’ll try out in about 5 hours.
(Please note I’m just on my phone right now.)
(Use this to look up inside a 1024-slot lookup table.)
const std = @import("std");
const builtin = @import("builtin");

// TODO: would work on powerpc targets too,
// but I'm on a phone at the moment
const HAS_FAST_PDEP_AND_PEXT = blk: {
    const cpu_name = builtin.cpu.model.llvm_name orelse builtin.cpu.model.name;
    break :blk builtin.cpu.arch == .x86_64 and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and
        // pdep is microcoded (slow) on AMD architectures before Zen 3.
        !std.mem.startsWith(u8, cpu_name, "bdver") and
        (!std.mem.startsWith(u8, cpu_name, "znver") or cpu_name["znver".len] >= '3');
};
pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!usize {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    if (!std.ascii.isAlphabetic(alpha2_code[0]) or !std.ascii.isAlphabetic(alpha2_code[1])) return error.InvalidAlpha2Code;
    const chars = @as(u16, @bitCast(alpha2_code[0..2].*));
    const loc = pext(chars, 0b11111_00011111);
    // loc is a u10 if you want it to be but I'm afraid that
    // the compiler would add an extra AND instruction
    return loc;
}
fn pext(src: anytype, comptime mask: @TypeOf(src)) @TypeOf(src) {
    if (mask == 0) return 0;
    const num_one_groups = @popCount(mask & ~(mask << 1));

    if (!@inComptime() and comptime num_one_groups >= 3 and @bitSizeOf(@TypeOf(src)) <= 64 and builtin.cpu.arch == .x86_64 and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and HAS_FAST_PDEP_AND_PEXT)
    {
        const methods = struct {
            extern fn @"llvm.x86.bmi.pext.32"(u32, u32) u32;
            extern fn @"llvm.x86.bmi.pext.64"(u64, u64) u64;
        };
        return switch (@TypeOf(src)) {
            u32 => methods.@"llvm.x86.bmi.pext.32"(src, mask),
            u64 => methods.@"llvm.x86.bmi.pext.64"(src, mask),
            // u64, u32 => asm ("pext %[mask], %[src], %[ret]"
            //     : [ret] "=r" (-> @TypeOf(src)),
            //     : [src] "r" (src),
            //       [mask] "r" (mask),
            // ),
            else => @intCast(pext(@as(if (@bitSizeOf(@TypeOf(src)) <= 32) u32 else u64, src), mask)),
        };
    } else if (num_one_groups >= 4) blk: {
        // Attempt to produce a `global_shift` value such that
        // the return statement at the end of this block moves the desired bits into the least significant
        // bit position.
        comptime var global_shift: @TypeOf(src) = 0;
        comptime {
            var x = mask;
            var target = @as(@TypeOf(src), 1) << (@bitSizeOf(@TypeOf(src)) - 1);
            for (0..@popCount(x) - 1) |_| target |= target >> 1;
            // The maximum sum of the garbage data. If this overflows into the target bits,
            // we can't use the global_shift.
            var left_overs: @TypeOf(src) = 0;
            var cur_pos: @TypeOf(src) = 0;
            while (true) {
                const shift = (@clz(x) - cur_pos);
                global_shift |= @as(@TypeOf(src), 1) << shift;
                var shifted_mask = x << shift;
                cur_pos = @clz(shifted_mask);
                cur_pos += @clz(~(shifted_mask << cur_pos));
                shifted_mask = shifted_mask << cur_pos >> cur_pos;
                left_overs += shifted_mask;
                if ((target & left_overs) != 0) break :blk;
                if ((shifted_mask & target) != 0) break :blk;
                x = shifted_mask >> shift;
                if (x == 0) break;
            }
        }
        return ((src & mask) *% global_shift) >> (@bitSizeOf(@TypeOf(src)) - @popCount(mask));
    }

    {
        var ans: @TypeOf(src) = 0;
        comptime var cur_pos = 0;
        comptime var x = mask;
        inline while (x != 0) {
            const mask_ctz = @ctz(x);
            const num_ones = @ctz(~(x >> mask_ctz));
            comptime var ones = 1;
            inline for (0..num_ones) |_| ones <<= 1;
            ones -%= 1;
            // @compileLog(std.fmt.comptimePrint("ans |= (src >> {}) & 0b{b}", .{ mask_ctz - cur_pos, (ones << cur_pos) }));
            ans |= (src >> (mask_ctz - cur_pos)) & (ones << cur_pos);
            cur_pos += num_ones;
            inline for (0..num_ones) |_| x &= x - 1;
        }
        return ans;
    }
}
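A minimal usage sketch of the above, assuming a hypothetical CountryRecord type and a 1024-slot table (neither is part of the original snippet):

const CountryRecord = struct { name: []const u8 = "" };

// Hypothetical 1024-slot table; only the slots reachable from two ASCII letters
// (via the pext mask above) ever need to be filled in.
var table = [_]CountryRecord{.{}} ** 1024;

fn lookup(alpha2: []const u8) !CountryRecord {
    return table[try alpha2_to_lookup_id(alpha2)];
}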
HAS_FAST_PDEP_AND_PEXT is false for me.
I did some more testing. For me, the following version is about 20% faster at generating the lookup slot; however, it requires a table of size 1024 instead of 676. This may not be so bad, because the unused slots mostly come in runs of 6 (there are 25 such runs), which means that if you are looking up 16-byte ptr+len pairs, likely 4 of those 6 will sit in a cache line that is never loaded at all. There are also a bunch of empty slots at the start and end of the table, most of which would never be loaded into cache either.
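To make that concrete, here is my reading of the index layout (a sketch assuming the little-endian byte order implied by the @bitCast in the code below):

// index = (first letter & 31) + 32 * (second letter & 31)
//   "aa"/"AA" -> 1 + 32 * 1   = 33    (lowest slot actually used)
//   "zz"/"ZZ" -> 26 + 32 * 26 = 858   (highest slot actually used)
// Between the last code of one second-letter group (26 + 32 * k) and the first
// code of the next group (1 + 32 * (k + 1)) lie 6 unused slots, which gives the
// 25 runs of 6, plus the unused slots 0..32 and 859..1023 at either end.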
Here is my updated version:
const std = @import("std");
const builtin = @import("builtin");

const HAS_FAST_PDEP_AND_PEXT = blk: {
    const cpu_name = builtin.cpu.model.llvm_name orelse builtin.cpu.model.name;
    break :blk (builtin.cpu.arch.isPowerPC() and std.Target.powerpc.featureSetHas(builtin.cpu.features, .isa_v31_instructions)) or
        builtin.cpu.arch.isX86() and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and
        // pdep is microcoded (slow) on AMD architectures before Zen 3.
        !std.mem.startsWith(u8, cpu_name, "bdver") and
        (!std.mem.startsWith(u8, cpu_name, "znver") or !(cpu_name["znver".len] < '3' and cpu_name["znver".len..].len == 1));
};
pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!u16 {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    const v: u16 = @bitCast(alpha2_code[0..2].*);
    const ones: @TypeOf(v) = @bitCast([_]u8{0x01} ** @divExact(@bitSizeOf(@TypeOf(v)), 8));
    const mask = comptime ones * 0x7F;
    // Clear bit 5 of each byte (folds lowercase onto uppercase), then XOR with 0x40
    // so that ASCII letters map to 1..26; other ASCII bytes map outside that range
    // (bytes with the top bit set are rejected separately below).
    const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
    // Bit 7 of each byte is set unless that byte mapped to 0 (e.g. '@' or '`').
    const non_0x40_or_0x60 = magic_mask + mask;
    // Bit 7 of each byte is set when that byte mapped to 27 or more (past 'Z'/'z').
    const not_too_high = magic_mask + comptime ones * (0x80 - 27);
    const ans = pextComptime(v, 0b11111_00011111); // technically a u10 when we're done
    // Valid iff every byte is ASCII (top bit of v clear) and maps into 1..26.
    return if ((~mask & (~v & (non_0x40_or_0x60 ^ not_too_high))) == ~mask)
        ans
    else
        error.InvalidAlpha2Code;
}
fn pextComptime(src: anytype, comptime mask: @TypeOf(src)) @TypeOf(src) {
    if (mask == 0) return 0;
    const num_one_groups = @popCount(mask & ~(mask << 1));

    if (!@inComptime() and comptime num_one_groups >= 2 and @bitSizeOf(@TypeOf(src)) <= 64 and HAS_FAST_PDEP_AND_PEXT) {
        const methods = struct {
            extern fn @"llvm.x86.bmi.pext.32"(u32, u32) u32;
            extern fn @"llvm.x86.bmi.pext.64"(u64, u64) u64;
            extern fn @"llvm.ppc.pextd"(u64, u64) u64;
        };
        return @intCast(switch (builtin.cpu.arch) {
            .powerpc64, .powerpc64le => methods.@"llvm.ppc.pextd"(src, mask),
            .x86, .x86_64 => if (@bitSizeOf(@TypeOf(src)) <= 32)
                methods.@"llvm.x86.bmi.pext.32"(src, mask)
            else
                methods.@"llvm.x86.bmi.pext.64"(src, mask),
            else => unreachable,
        });
    } else if (num_one_groups >= 4) blk: {
        // Attempt to produce a `global_shift` value such that
        // the return statement at the end of this block moves the desired bits into the least significant
        // bit position.
        comptime var global_shift: @TypeOf(src) = 0;
        comptime {
            var x = mask;
            var target = @as(@TypeOf(src), 1) << (@bitSizeOf(@TypeOf(src)) - 1);
            for (0..@popCount(x) - 1) |_| target |= target >> 1;
            // The maximum sum of the garbage data. If this overflows into the target bits,
            // we can't use the global_shift.
            var left_overs: @TypeOf(src) = 0;
            var cur_pos: @TypeOf(src) = 0;
            while (true) {
                const shift = (@clz(x) - cur_pos);
                global_shift |= @as(@TypeOf(src), 1) << shift;
                var shifted_mask = x << shift;
                cur_pos = @clz(shifted_mask);
                cur_pos += @clz(~(shifted_mask << cur_pos));
                shifted_mask = shifted_mask << cur_pos >> cur_pos;
                left_overs += shifted_mask;
                if ((target & left_overs) != 0) break :blk;
                if ((shifted_mask & target) != 0) break :blk;
                x = shifted_mask >> shift;
                if (x == 0) break;
            }
        }
        return ((src & mask) *% global_shift) >> (@bitSizeOf(@TypeOf(src)) - @popCount(mask));
    }

    {
        var ans: @TypeOf(src) = 0;
        comptime var cur_pos = 0;
        comptime var x = mask;
        inline while (x != 0) {
            const mask_ctz = @ctz(x);
            const num_ones = @ctz(~(x >> mask_ctz));
            comptime var ones = 1;
            inline for (0..num_ones) |_| ones <<= 1;
            ones -%= 1;
            // @compileLog(std.fmt.comptimePrint("ans |= (src >> {}) & 0b{b}", .{ mask_ctz - cur_pos, (ones << cur_pos) }));
            ans |= (src >> (mask_ctz - cur_pos)) & (ones << cur_pos);
            cur_pos += num_ones;
            inline for (0..num_ones) |_| x &= x - 1;
        }
        return ans;
    }
}
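And a minimal test sketch for this variant (the expected slot numbers are derived from the layout described above, not taken from the original post; "aa" and "ZZ" are symmetric pairs, so they come out the same regardless of byte order):

test "alpha2_to_lookup_id (1024-slot variant)" {
    try std.testing.expectEqual(@as(u16, 33), try alpha2_to_lookup_id("aa"));
    try std.testing.expectEqual(@as(u16, 858), try alpha2_to_lookup_id("ZZ"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("a1"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("zzz"));
}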