@memset beaten

With function signature fixed:

const std = @import("std");

/// Fills `len` bytes starting at `dest` with the byte `c` and returns `dest`.
///
/// Exported with the C calling convention. `dest` is optional and is only
/// unwrapped inside the loops, so it is not dereferenced when `len` is 0.
///
/// NOTE: this is the earlier of the two versions shown in this post; the
/// follow-up version replaces the `@ptrCast` below, which the post calls illegal.
export fn improved_memset(dest: ?[*]u8, c: u8, len: usize) callconv(.c) ?[*]u8 {
    // Turn off runtime safety checks for this function body.
    @setRuntimeSafety(false);

    // Chunk width in bytes: the suggested u8 vector length for the target,
    // or the size of a usize when no suggestion is available.
    const n = std.simd.suggestVectorLength(u8) orelse @sizeOf(usize);

    var i: usize = 0;
    // Bulk loop: store one n-byte vector per iteration while a full chunk fits.
    while (i + n <= len) : (i += n) {
        // Reinterpret the destination at offset i as a byte-aligned vector pointer.
        const p: *align(1) @Vector(n, u8) = @ptrCast(dest.?[i..]);
        // Broadcast c to every lane and write the whole vector at once.
        p.* = @splat(c);
    }
    // Tail loop: write the remaining (fewer than n) bytes one at a time.
    while (i < len) : (i += 1) {
        dest.?[i] = c;
    }

    return dest;
}
0000000000000000 <improved_memset>:
   0: 55                    push   rbp
   1: 48 89 e5              mov    rbp,rsp
   4: 53                    push   rbx
   5: 50                    push   rax
   6: 48 89 fb              mov    rbx,rdi
   9: 48 83 fa 40           cmp    rdx,0x40
   d: 73 04                 jae    13 <improved_memset+0x13>
   f: 31 ff                 xor    edi,edi
  11: eb 24                 jmp    37 <improved_memset+0x37>
  13: 62 f2 7d 48 7a c6     vpbroadcastb zmm0,esi
  19: 31 c0                 xor    eax,eax
  1b: 0f 1f 44 00 00        nop    DWORD PTR [rax+rax*1+0x0]
  20: 62 f1 fe 48 7f 04 03  vmovdqu64 ZMMWORD PTR [rbx+rax*1],zmm0
  27: 48 8d 78 40           lea    rdi,[rax+0x40]
  2b: 48 83 e8 80           sub    rax,0xffffffffffffff80
  2f: 48 39 d0              cmp    rax,rdx
  32: 48 89 f8              mov    rax,rdi
  35: 76 e9                 jbe    20 <improved_memset+0x20>
  37: 48 29 fa              sub    rdx,rdi
  3a: 76 0f                 jbe    4b <improved_memset+0x4b>
  3c: 48 01 df              add    rdi,rbx
  3f: 40 0f b6 f6           movzx  esi,sil
  43: c5 f8 77              vzeroupper
  46: e8 00 00 00 00        call   4b <improved_memset+0x4b>
  4b: 48 89 d8              mov    rax,rbx
  4e: 48 83 c4 08           add    rsp,0x8
  52: 5b                    pop    rbx
  53: 5d                    pop    rbp
  54: c5 f8 77              vzeroupper
  57: c3                    ret

With the illegal @ptrCast fixed:

const std = @import("std");

/// Writes the byte `c` into the first `len` bytes at `dest` and hands `dest`
/// back to the caller.
///
/// Exported with the C calling convention. `dest` is optional; it is unwrapped
/// only inside the loops, so nothing is dereferenced when `len` is 0.
export fn improved_memset(dest: ?[*]u8, c: u8, len: usize) callconv(.c) ?[*]u8 {
    // Runtime safety checks are switched off for this function body.
    @setRuntimeSafety(false);

    // Bytes per bulk store: the suggested u8 vector length, else sizeof(usize).
    const lanes = std.simd.suggestVectorLength(u8) orelse @sizeOf(usize);
    // The fill byte broadcast to every lane, computed once up front.
    const fill: @Vector(lanes, u8) = @splat(c);

    var offset: usize = 0;

    // Bulk phase: store `lanes` bytes per iteration while a whole chunk fits.
    while (offset + lanes <= len) {
        // Pointer to a fixed-size array of `lanes` bytes at the current offset;
        // assigning the vector through it stores all bytes of the chunk.
        const chunk: *[lanes]u8 = dest.?[offset..][0..lanes];
        chunk.* = fill;
        offset += lanes;
    }

    // Tail phase: finish the leftover bytes one at a time.
    while (offset < len) {
        dest.?[offset] = c;
        offset += 1;
    }

    return dest;
}

(same machine code, except it won’t miscompile under various conditions, and it won’t become a compile error when we make language changes to vectors)

7 Likes