Efficient runtime equivalent of `@unionInit`

I have a runtime-known active union tag and a type-erased runtime-known payload. How can I efficiently (in terms of speed and code size) initialize a union from them?

After a couple of hours of trial and error, the closest I’ve come to an ideal generic solution is this monstrosity:

/// Builds a `Union` value whose active field is selected by the runtime-known
/// `active_tag` and whose payload bytes are copied from `payload`.
///
/// `Union` must be a tagged union: its `tag_type` is unwrapped with `.?`.
/// `payload` is type-erased; `@sizeOf` the selected field's type bytes are read
/// from it, so it must point to at least that many readable bytes
/// (presumably holding a valid value of that field's type).
pub fn runtimeUnionInit(
    comptime Union: type,
    active_tag: @typeInfo(Union).@"union".tag_type.?,
    payload: *const anyopaque,
) Union {
    // Step 1: activate the requested field. The payload is left undefined
    // here and overwritten by the copy below.
    var result: Union = switch (active_tag) {
        inline else => |tag| @unionInit(Union, @tagName(tag), undefined),
    };
    // Step 2: byte pointer to the storage of the now-active field.
    const result_payload_bytes: [*]u8 = switch (active_tag) {
        inline else => |tag| @ptrCast(&@field(result, @tagName(tag))),
    };
    // Step 3: payload size in bytes for the active field.
    // The three steps are deliberately separate switches; this shape was
    // observed to make the optimizer emit one lookup table per switch.
    const payload_size: usize = switch (active_tag) {
        inline else => |tag| @sizeOf(@FieldType(Union, @tagName(tag))),
    };
    // The source is a many-item pointer, so the copy length comes from the
    // destination slice.
    @memcpy(result_payload_bytes[0..payload_size], @as([*]const u8, @ptrCast(payload)));
    return result;
}

The separate switches are necessary in order to coerce the compiler/optimizer into constructing SoA lookup tables. Looking at the disassembly using Godbolt (https://zig.godbolt.org/z/W1chY6cnj) this looks pretty good and scales well for unions with many disorderly fields:

example.getU:
        push    rbp
        mov     rbp, rsp
        mov     eax, offset example.u
        pop     rbp
        ret

example.setU:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32
        xor     dil, 16
        movzx   eax, dil
        and     eax, 31
        shl     eax, 3
        mov     rcx, qword ptr [rax + .Lswitch.table.example.setU]
        mov     rdx, qword ptr [rax + .Lswitch.table.example.setU.1]
        mov     rax, qword ptr [rcx + 16]
        mov     qword ptr [rbp - 16], rax
        movups  xmm0, xmmword ptr [rcx]
        movaps  xmmword ptr [rbp - 32], xmm0
        lea     rdi, [rbp - 32]
        call    memcpy@PLT
        movaps  xmm0, xmmword ptr [rbp - 32]
        movups  xmmword ptr [rip + example.u], xmm0
        mov     rax, qword ptr [rbp - 16]
        mov     qword ptr [rip + example.u+16], rax
        add     rsp, 32
        pop     rbp
        ret

__anon_650:
        .zero   16
        .byte   0
        .zero   7

__anon_657:
        .zero   16
        .byte   1
        .zero   7

__anon_664:
        .zero   16
        .byte   2
        .zero   7

__anon_671:
        .zero   16
        .byte   3
        .zero   7

__anon_678:
        .zero   16
        .byte   4
        .zero   7

__anon_685:
        .zero   16
        .byte   5
        .zero   7

__anon_692:
        .zero   16
        .byte   6
        .zero   7

__anon_699:
        .zero   16
        .byte   7
        .zero   7

__anon_706:
        .zero   16
        .byte   8
        .zero   7

__anon_713:
        .zero   16
        .byte   9
        .zero   7

__anon_722:
        .zero   16
        .byte   10
        .zero   7

__anon_729:
        .zero   16
        .byte   11
        .zero   7

__anon_736:
        .zero   16
        .byte   12
        .zero   7

__anon_743:
        .zero   16
        .byte   13
        .zero   7

__anon_750:
        .zero   16
        .byte   14
        .zero   7

__anon_757:
        .zero   16
        .byte   15
        .zero   7

__anon_764:
        .zero   16
        .byte   16
        .zero   7

__anon_771:
        .zero   16
        .byte   17
        .zero   7

__anon_778:
        .zero   16
        .byte   18
        .zero   7

__anon_785:
        .zero   16
        .byte   19
        .zero   7

__anon_798:
        .zero   16
        .byte   20
        .zero   7

__anon_805:
        .zero   16
        .byte   21
        .zero   7

__anon_812:
        .zero   16
        .byte   22
        .zero   7

__anon_819:
        .zero   16
        .byte   23
        .zero   7

__anon_826:
        .zero   16
        .byte   24
        .zero   7

__anon_834:
        .zero   16
        .byte   25
        .zero   7

__anon_842:
        .zero   16
        .byte   26
        .zero   7

__anon_850:
        .zero   16
        .byte   27
        .zero   7

__anon_858:
        .zero   16
        .byte   28
        .zero   7

__anon_866:
        .zero   16
        .byte   29
        .zero   7

.Lswitch.table.example.setU:
        .quad   __anon_764
        .quad   __anon_771
        .quad   __anon_778
        .quad   __anon_785
        .quad   __anon_798
        .quad   __anon_805
        .quad   __anon_812
        .quad   __anon_819
        .quad   __anon_826
        .quad   __anon_834
        .quad   __anon_842
        .quad   __anon_850
        .quad   __anon_858
        .quad   __anon_866
        .zero   8
        .zero   8
        .quad   __anon_650
        .quad   __anon_657
        .quad   __anon_664
        .quad   __anon_671
        .quad   __anon_678
        .quad   __anon_685
        .quad   __anon_692
        .quad   __anon_699
        .quad   __anon_706
        .quad   __anon_713
        .quad   __anon_722
        .quad   __anon_729
        .quad   __anon_736
        .quad   __anon_743
        .quad   __anon_750
        .quad   __anon_757

.Lswitch.table.example.setU.1:
        .quad   4
        .quad   4
        .quad   4
        .quad   4
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .quad   16
        .zero   8
        .zero   8
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   0
        .quad   4
        .quad   4
        .quad   4
        .quad   4
        .quad   4
        .quad   4

getU = example.getU
setU = example.setU
(Zig source code)
/// Builds a `Union` value whose active field is selected by the runtime-known
/// `active_tag` and whose payload bytes are copied from `payload`.
///
/// `Union` must be a tagged union: its `tag_type` is unwrapped with `.?`.
/// `payload` is type-erased; `@sizeOf` the selected field's type bytes are read
/// from it, so it must point to at least that many readable bytes
/// (presumably holding a valid value of that field's type).
pub fn runtimeUnionInit(
    comptime Union: type,
    active_tag: @typeInfo(Union).@"union".tag_type.?,
    payload: *const anyopaque,
) Union {
    // Step 1: activate the requested field. The payload is left undefined
    // here and overwritten by the copy below.
    var result: Union = switch (active_tag) {
        inline else => |tag| @unionInit(Union, @tagName(tag), undefined),
    };
    // Step 2: byte pointer to the storage of the now-active field.
    const result_payload_bytes: [*]u8 = switch (active_tag) {
        inline else => |tag| @ptrCast(&@field(result, @tagName(tag))),
    };
    // Step 3: payload size in bytes for the active field.
    // The three steps are deliberately separate switches; this shape was
    // observed to make the optimizer emit one lookup table per switch.
    const payload_size: usize = switch (active_tag) {
        inline else => |tag| @sizeOf(@FieldType(Union, @tagName(tag))),
    };
    // The source is a many-item pointer, so the copy length comes from the
    // destination slice.
    @memcpy(result_payload_bytes[0..payload_size], @as([*]const u8, @ptrCast(payload)));
    return result;
}

/// Test union for inspecting the generated code: thirty fields in three
/// groups of ten, with `void`, `i32` and `[]const u8` payloads respectively.
const U = union(enum) {
    void0: void,
    void1: void,
    void2: void,
    void3: void,
    void4: void,
    void5: void,
    void6: void,
    void7: void,
    void8: void,
    void9: void,
    int0: i32,
    int1: i32,
    int2: i32,
    int3: i32,
    int4: i32,
    int5: i32,
    int6: i32,
    int7: i32,
    int8: i32,
    int9: i32,
    string0: []const u8,
    string1: []const u8,
    string2: []const u8,
    string3: []const u8,
    string4: []const u8,
    string5: []const u8,
    string6: []const u8,
    string7: []const u8,
    string8: []const u8,
    string9: []const u8,
};

// Global instance written by `setU` and exposed by `getU`
// (presumably global so the stores stay visible in the disassembly).
var u: U = undefined;

/// Exported entry point: rebuilds the global `u` from a raw tag value and a
/// type-erased payload via `runtimeUnionInit`.
///
/// `tag` is converted with `@enumFromInt`, so it must be the integer value of
/// one of `U`'s tags.
export fn setU(tag: u32, payload: *const anyopaque) void {
    u = runtimeUnionInit(U, @enumFromInt(tag), payload);
}

/// Exported entry point: returns the address of the global `u` as an opaque
/// pointer.
export fn getU() *const anyopaque {
    return &u;
}

However, one pretty significant mistake the compiler makes is that it constructs a lookup table of all possible uninitialized payloads, roughly equivalent to this:

// Illustration of the table the compiler effectively builds: one complete
// union value per field, each with an undefined payload, selected at runtime
// (`i` presumably stands for the index derived from the active tag).
const unions = [_]Union{
    .{ .field0 = undefined },
    .{ .field1 = undefined },
    .{ .field2 = undefined },
    // ...
    .{ .fieldn = undefined },
};
var result: Union = unions[i];

This results in a lot of unnecessary binary bloat, especially if the union has one or a few fields that have a significantly larger size than the rest (try adding padding: [256]u8 to the Godbolt repro to see for yourself). See also this issue: Inefficient handling of initialization to undefined with structs, unions, optionals and error unions · Issue #24313 · ziglang/zig · GitHub

What I really want is for the compiler to just do this:

// Pseudocode for the desired lowering, not real Zig field accesses:
// `__tag` and `__payload` stand for the union's tag and payload storage, and
// `lookUpPayloadSize` stands for a per-tag size lookup.
var result: Union = undefined;
result.__tag = active_tag;
const result_payload_bytes: [*]u8 = @ptrCast(&result.__payload);
const payload_size = lookUpPayloadSize(active_tag);
@memcpy(result_payload_bytes[0..payload_size], @as([*]const u8, @ptrCast(payload)));
return result;

But I can’t find any way to express this so that the compiler takes the hint, without explicitly reinterpreting memory and making assumptions about the layout of tagged unions (which I would prefer to avoid due to risk of breakage in the future). I’ve been searching threads and issues and grepping the compiler codebase for @unionInit to look for similar patterns but I’m coming up empty.

Does anyone have any ideas, or is this the best we can do without compiler improvements and/or language changes?

3 Likes

Would degrading to an extern struct containing a tag and extern union fit here?

1 Like

In my specific case the answer is unfortunately no. I’m exploring this in an attempt to move common code paths in generic code to a shared non-generic function, in order to reduce binary bloat from instantiated generic functions, and I would prefer to have the public API look the same and use proper tagged unions.

But in other situations in less publicly exposed code I would consider this a viable alternative, e.g. taking inspiration from MultiArrayList:

// Element representation chosen from `T`: a struct is used as-is, a tagged
// union is wrapped in a struct holding its tag and payload separately, and
// anything else is a compile error.
const Elem = switch (@typeInfo(T)) {
    .@"struct" => T,
    .@"union" => |u| struct {
        // Union type rebuilt from the names, types and alignments of `T`'s
        // fields, keeping `T`'s layout. NOTE(review): the `null` argument is
        // presumably the tag type (i.e. untagged) — confirm against `@Union`.
        pub const Bare = Bare: {
            var field_names: [u.fields.len][]const u8 = undefined;
            var field_types: [u.fields.len]type = undefined;
            var field_attrs: [u.fields.len]std.builtin.Type.UnionField.Attributes = undefined;
            for (u.fields, &field_names, &field_types, &field_attrs) |field, *name, *Type, *attrs| {
                name.* = field.name;
                Type.* = field.type;
                attrs.* = .{ .@"align" = field.alignment };
            }
            break :Bare @Union(u.layout, null, &field_names, &field_types, &field_attrs);
        };
        // Tag type of `T`; a union without one is rejected at compile time.
        pub const Tag =
            u.tag_type orelse @compileError("MultiArrayList does not support untagged unions");
        tags: Tag,
        data: Bare,

        /// Splits a value of `T` into its active tag and a `Bare` union
        /// holding the same field's payload.
        pub fn fromT(outer: T) @This() {
            const tag = meta.activeTag(outer);
            return .{
                .tags = tag,
                .data = switch (tag) {
                    inline else => |t| @unionInit(Bare, @tagName(t), @field(outer, @tagName(t))),
                },
            };
        }
        /// Rebuilds a value of `T` from a tag and a `Bare` payload by reading
        /// the `bare` field named after `tag`.
        pub fn toT(tag: Tag, bare: Bare) T {
            return switch (tag) {
                inline else => |t| @unionInit(T, @tagName(t), @field(bare, @tagName(t))),
            };
        }
    },
    else => @compileError("MultiArrayList only supports structs and tagged unions"),
};

(However note that toT above has the same perf/bloat issues as my example.)

1 Like

Monomorphising manually probably ends up with just as much code as letting the compiler do it. I suppose if you’re careless about how many variants you instantiate, letting the compiler do it can tend to bloat, but then that’s how you’ve chosen to use it.

Using a manually tagged union gets you the code gen you’re after, at the cost of type safety.

It is not just that the `undefined` payloads are not optimised out: the `inline else` switches generate suboptimal tables in general, which for smaller types, like the examples you linked, contributes more to the bloat than the `undefined` does!

Unfortunately that means if you convert back to a zig tagged union, you get most of the bloat back…

I did find other ways to remove the excessive undefined in the tables, but they had even worse codegen that was a net increase in size.

Either Zig needs to improve its optimisations for inline switches, or it needs to provide a way to directly access the tag.


That being said, I am assuming you want smaller size; though considering how much smaller the code for a manual tag is, it is most certainly faster too.