As @gonzo pointed out, the conditional is not quite right. Using std.ascii.isAlphabetic
fixes it and is equally performant, though:
const std = @import("std");

pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!usize {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    if (!std.ascii.isAlphabetic(alpha2_code[0]) or !std.ascii.isAlphabetic(alpha2_code[1])) return error.InvalidAlpha2Code;
    const a = (alpha2_code[0] | 32) - 'a';
    const b = (alpha2_code[1] | 32) - 'a';
    return @as(u16, a) * 26 + b;
}
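For a quick sanity check, here is a minimal test sketch (the expected values follow directly from the arithmetic above: ids range from 0 for "aa"/"AA" up to 675 for "zz"/"ZZ", matching a 676-slot table):

test "alpha2_to_lookup_id maps letter pairs into 0..675" {
    try std.testing.expectEqual(@as(usize, 0), try alpha2_to_lookup_id("aa"));
    try std.testing.expectEqual(@as(usize, 538), try alpha2_to_lookup_id("US")); // 20 * 26 + 18
    try std.testing.expectEqual(@as(usize, 675), try alpha2_to_lookup_id("ZZ"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("u5"));
}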
I’d try:
EDIT: I thought about this a little more and I have some more ideas I’ll try out in about 5 hours.
(Please note I’m just on my phone right now.)
(Use this to look up inside a 1024-slot lookup table.)
const std = @import("std");
const builtin = @import("builtin");

// TODO: would work on powerpc targets too,
// but I'm on a phone at the moment
const HAS_FAST_PDEP_AND_PEXT = blk: {
    const cpu_name = builtin.cpu.model.llvm_name orelse builtin.cpu.model.name;
    break :blk builtin.cpu.arch == .x86_64 and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and
        // pdep is microcoded (slow) on AMD architectures before Zen 3.
        !std.mem.startsWith(u8, cpu_name, "bdver") and
        (!std.mem.startsWith(u8, cpu_name, "znver") or cpu_name["znver".len] >= '3');
};
pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!usize {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    if (!std.ascii.isAlphabetic(alpha2_code[0]) or !std.ascii.isAlphabetic(alpha2_code[1])) return error.InvalidAlpha2Code;
    const chars = @as(u16, @bitCast(alpha2_code[0..2].*));
    const loc = pext(chars, 0b11111_00011111);
    // loc is a u10 if you want it to be but I'm afraid that
    // the compiler would add an extra AND instruction
    return loc;
}
fn pext(src: anytype, comptime mask: @TypeOf(src)) @TypeOf(src) {
    if (mask == 0) return 0;
    const num_one_groups = @popCount(mask & ~(mask << 1));

    if (!@inComptime() and comptime num_one_groups >= 3 and @bitSizeOf(@TypeOf(src)) <= 64 and builtin.cpu.arch == .x86_64 and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and HAS_FAST_PDEP_AND_PEXT)
    {
        const methods = struct {
            extern fn @"llvm.x86.bmi.pext.32"(u32, u32) u32;
            extern fn @"llvm.x86.bmi.pext.64"(u64, u64) u64;
        };
        return switch (@TypeOf(src)) {
            u32 => methods.@"llvm.x86.bmi.pext.32"(src, mask),
            u64 => methods.@"llvm.x86.bmi.pext.64"(src, mask),
            // u64, u32 => asm ("pext %[mask], %[src], %[ret]"
            //     : [ret] "=r" (-> @TypeOf(src)),
            //     : [src] "r" (src),
            //       [mask] "r" (mask),
            // ),
            else => @intCast(pext(@as(if (@bitSizeOf(@TypeOf(src)) <= 32) u32 else u64, src), mask)),
        };
    } else if (num_one_groups >= 4) blk: {
        // Attempt to produce a `global_shift` value such that
        // the return statement at the end of this block moves the desired bits into the least significant
        // bit position.
        comptime var global_shift: @TypeOf(src) = 0;
        comptime {
            var x = mask;
            var target = @as(@TypeOf(src), 1) << (@bitSizeOf(@TypeOf(src)) - 1);
            for (0..@popCount(x) - 1) |_| target |= target >> 1;
            // The maximum sum of the garbage data. If this overflows into the target bits,
            // we can't use the global_shift.
            var left_overs: @TypeOf(src) = 0;
            var cur_pos: @TypeOf(src) = 0;
            while (true) {
                const shift = (@clz(x) - cur_pos);
                global_shift |= @as(@TypeOf(src), 1) << shift;
                var shifted_mask = x << shift;
                cur_pos = @clz(shifted_mask);
                cur_pos += @clz(~(shifted_mask << cur_pos));
                shifted_mask = shifted_mask << cur_pos >> cur_pos;
                left_overs += shifted_mask;
                if ((target & left_overs) != 0) break :blk;
                if ((shifted_mask & target) != 0) break :blk;
                x = shifted_mask >> shift;
                if (x == 0) break;
            }
        }
        return ((src & mask) *% global_shift) >> (@bitSizeOf(@TypeOf(src)) - @popCount(mask));
    }

    {
        var ans: @TypeOf(src) = 0;
        comptime var cur_pos = 0;
        comptime var x = mask;
        inline while (x != 0) {
            const mask_ctz = @ctz(x);
            const num_ones = @ctz(~(x >> mask_ctz));
            comptime var ones = 1;
            inline for (0..num_ones) |_| ones <<= 1;
            ones -%= 1;
            // @compileLog(std.fmt.comptimePrint("ans |= (src >> {}) & 0b{b}", .{ mask_ctz - cur_pos, (ones << cur_pos) }));
            ans |= (src >> (mask_ctz - cur_pos)) & (ones << cur_pos);
            cur_pos += num_ones;
            inline for (0..num_ones) |_| x &= x - 1;
        }
        return ans;
    }
}
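A minimal usage sketch of the above, assuming a hypothetical CountryRecord type and a 1024-slot table (neither is part of the original snippet):

const CountryRecord = struct { name: []const u8 = "" };

// Hypothetical 1024-slot table; only the slots reachable from two ASCII letters
// (via the pext mask above) ever need to be filled in.
var table = [_]CountryRecord{.{}} ** 1024;

fn lookup(alpha2: []const u8) !CountryRecord {
    return table[try alpha2_to_lookup_id(alpha2)];
}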
HAS_FAST_PDEP_AND_PEXT is false for me.
I did some more testing. For me, the following version is about 20% faster at generating the lookup slot; however, it requires a table of size 1024 instead of 676. This may not be so bad, because the unused slots mostly come in runs of 6 (there are 25 such runs), which means that if you are looking up 16-byte ptr+len pairs, likely 4 of those 6 will sit in a cache line that is never loaded at all. There are also a bunch of empty slots at the start and end of the table, most of which would never be loaded into cache either.
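To make that concrete, here is my reading of the index layout (a sketch assuming the little-endian byte order implied by the @bitCast in the code below):

// index = (first letter & 31) + 32 * (second letter & 31)
//   "aa"/"AA" -> 1 + 32 * 1   = 33    (lowest slot actually used)
//   "zz"/"ZZ" -> 26 + 32 * 26 = 858   (highest slot actually used)
// Between the last code of one second-letter group (26 + 32 * k) and the first
// code of the next group (1 + 32 * (k + 1)) lie 6 unused slots, which gives the
// 25 runs of 6, plus the unused slots 0..32 and 859..1023 at either end.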
Here is my updated version:
const std = @import("std");
const builtin = @import("builtin");

const HAS_FAST_PDEP_AND_PEXT = blk: {
    const cpu_name = builtin.cpu.model.llvm_name orelse builtin.cpu.model.name;
    break :blk (builtin.cpu.arch.isPowerPC() and std.Target.powerpc.featureSetHas(builtin.cpu.features, .isa_v31_instructions)) or
        builtin.cpu.arch.isX86() and
        std.Target.x86.featureSetHas(builtin.cpu.features, .bmi2) and
        // pdep is microcoded (slow) on AMD architectures before Zen 3.
        !std.mem.startsWith(u8, cpu_name, "bdver") and
        (!std.mem.startsWith(u8, cpu_name, "znver") or !(cpu_name["znver".len] < '3' and cpu_name["znver".len..].len == 1));
};
pub fn alpha2_to_lookup_id(alpha2_code: []const u8) error{InvalidAlpha2Code}!u16 {
    if (alpha2_code.len != 2) return error.InvalidAlpha2Code;
    const v: u16 = @bitCast(alpha2_code[0..2].*);
    const ones: @TypeOf(v) = @bitCast([_]u8{0x01} ** @divExact(@bitSizeOf(@TypeOf(v)), 8));
    const mask = comptime ones * 0x7F;
    // Clear bit 5 of each byte (folds lowercase onto uppercase), then XOR with 0x40
    // so that ASCII letters map to 1..26; other ASCII bytes map outside that range
    // (bytes with the top bit set are rejected separately below).
    const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
    // Bit 7 of each byte is set unless that byte mapped to 0 (e.g. '@' or '`').
    const non_0x40_or_0x60 = magic_mask + mask;
    // Bit 7 of each byte is set when that byte mapped to 27 or more (past 'Z'/'z').
    const not_too_high = magic_mask + comptime ones * (0x80 - 27);
    const ans = pextComptime(v, 0b11111_00011111); // technically a u10 when we're done
    // Valid iff every byte is ASCII (top bit of v clear) and maps into 1..26.
    return if ((~mask & (~v & (non_0x40_or_0x60 ^ not_too_high))) == ~mask)
        ans
    else
        error.InvalidAlpha2Code;
}
fn pextComptime(src: anytype, comptime mask: @TypeOf(src)) @TypeOf(src) {
    if (mask == 0) return 0;
    const num_one_groups = @popCount(mask & ~(mask << 1));

    if (!@inComptime() and comptime num_one_groups >= 2 and @bitSizeOf(@TypeOf(src)) <= 64 and HAS_FAST_PDEP_AND_PEXT) {
        const methods = struct {
            extern fn @"llvm.x86.bmi.pext.32"(u32, u32) u32;
            extern fn @"llvm.x86.bmi.pext.64"(u64, u64) u64;
            extern fn @"llvm.ppc.pextd"(u64, u64) u64;
        };
        return @intCast(switch (builtin.cpu.arch) {
            .powerpc64, .powerpc64le => methods.@"llvm.ppc.pextd"(src, mask),
            .x86, .x86_64 => if (@bitSizeOf(@TypeOf(src)) <= 32)
                methods.@"llvm.x86.bmi.pext.32"(src, mask)
            else
                methods.@"llvm.x86.bmi.pext.64"(src, mask),
            else => unreachable,
        });
    } else if (num_one_groups >= 4) blk: {
        // Attempt to produce a `global_shift` value such that
        // the return statement at the end of this block moves the desired bits into the least significant
        // bit position.
        comptime var global_shift: @TypeOf(src) = 0;
        comptime {
            var x = mask;
            var target = @as(@TypeOf(src), 1) << (@bitSizeOf(@TypeOf(src)) - 1);
            for (0..@popCount(x) - 1) |_| target |= target >> 1;
            // The maximum sum of the garbage data. If this overflows into the target bits,
            // we can't use the global_shift.
            var left_overs: @TypeOf(src) = 0;
            var cur_pos: @TypeOf(src) = 0;
            while (true) {
                const shift = (@clz(x) - cur_pos);
                global_shift |= @as(@TypeOf(src), 1) << shift;
                var shifted_mask = x << shift;
                cur_pos = @clz(shifted_mask);
                cur_pos += @clz(~(shifted_mask << cur_pos));
                shifted_mask = shifted_mask << cur_pos >> cur_pos;
                left_overs += shifted_mask;
                if ((target & left_overs) != 0) break :blk;
                if ((shifted_mask & target) != 0) break :blk;
                x = shifted_mask >> shift;
                if (x == 0) break;
            }
        }
        return ((src & mask) *% global_shift) >> (@bitSizeOf(@TypeOf(src)) - @popCount(mask));
    }

    {
        var ans: @TypeOf(src) = 0;
        comptime var cur_pos = 0;
        comptime var x = mask;
        inline while (x != 0) {
            const mask_ctz = @ctz(x);
            const num_ones = @ctz(~(x >> mask_ctz));
            comptime var ones = 1;
            inline for (0..num_ones) |_| ones <<= 1;
            ones -%= 1;
            // @compileLog(std.fmt.comptimePrint("ans |= (src >> {}) & 0b{b}", .{ mask_ctz - cur_pos, (ones << cur_pos) }));
            ans |= (src >> (mask_ctz - cur_pos)) & (ones << cur_pos);
            cur_pos += num_ones;
            inline for (0..num_ones) |_| x &= x - 1;
        }
        return ans;
    }
}
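And a minimal test sketch for this variant (the expected slot numbers are derived from the layout described above, not taken from the original post; "aa" and "ZZ" are symmetric pairs, so they come out the same regardless of byte order):

test "alpha2_to_lookup_id (1024-slot variant)" {
    try std.testing.expectEqual(@as(u16, 33), try alpha2_to_lookup_id("aa"));
    try std.testing.expectEqual(@as(u16, 858), try alpha2_to_lookup_id("ZZ"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("a1"));
    try std.testing.expectError(error.InvalidAlpha2Code, alpha2_to_lookup_id("zzz"));
}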