Can this compiler optimize two switch statements into one? Is that even possible?

I tested two different switch initialization syntaxes, and they produce different assembly code. One variant of A uses more cmp instructions, which doesn’t seem to be optimally optimized. Is the compiler capable of optimizing variant A into variant B?

Here is the URL I used for testing: Compiler Explorer

const std = @import("std");

export fn initA(cond: i32) void {
    const num1: i32 = switch(cond) {
        1 => num1_with_cond_1,
        2 => num1_with_cond_2,
        else => num1_with_cond_else,
    };

    const num2: i32 = switch(cond) {
        1 => num2_with_cond_1,
        2 => num2_with_cond_2,
        100 => num2_with_cond_3,
        else => num2_with_cond_else,
    };

    testPrintI32(num1);
    testPrintI32(num2);
}

export fn initB(cond: i32) void {
     var num1: i32 = undefined;
     var num2: i32 = undefined;

     switch(cond) {
        1 => {
            num1 = num1_with_cond_1;
            num2 = num2_with_cond_1;
        },
        2 => {
            num1 = num1_with_cond_2;
            num2 = num2_with_cond_2;
        },
        100 => {
            num1 = num1_with_cond_else;
            num2 = num2_with_cond_3;
        },
        else => {
            num1 = num1_with_cond_else;
            num2 = num2_with_cond_else;
        }
     }
    testPrintI32(num1);
    testPrintI32(num2);
}

export fn testPrintI32(num: i32) void {
    std.debug.print("{d}", .{num});
}

const num1_with_cond_1 = 453;
const num1_with_cond_2 = 879;
const num1_with_cond_else = 134;

const num2_with_cond_1 = 1334;
const num2_with_cond_2 = 3345;
const num2_with_cond_3 = 5456;
const num2_with_cond_else = 9789;

In the assembly code for initA, there are five cmp instructions, and both the numbers 2 and 1 are compared twice.

initA:
        push    rbp
        mov     rbp, rsp
        push    rbx
        push    rax
        cmp     edi, 100
        je      .LBB0_5
        cmp     edi, 2
        je      .LBB0_4
        mov     ebx, 1334
        cmp     edi, 1
        je      .LBB0_3
        mov     ebx, 9789
        jmp     .LBB0_3
.LBB0_4:
        mov     ebx, 3345
        jmp     .LBB0_3
.LBB0_5:
        mov     ebx, 5456
.LBB0_3:
        cmp     edi, 2
        mov     eax, 879
        mov     ecx, 134
        cmove   ecx, eax
        cmp     edi, 1
        mov     edi, 453
        cmovne  edi, ecx
        call    testPrintI32
        mov     edi, ebx
        add     rsp, 8
        pop     rbx
        pop     rbp
        jmp     testPrintI32

In the assembly code for initB, there are only three cmp instructions, which aligns with expectations.


initB:
        push    rbp
        mov     rbp, rsp
        push    rbx
        push    rax
        cmp     edi, 100
        je      .LBB9_6
        cmp     edi, 2
        je      .LBB9_5
        cmp     edi, 1
        jne     .LBB9_7
        mov     edi, 453
        mov     ebx, 1334
        jmp     .LBB9_4
.LBB9_5:
        mov     edi, 879
        mov     ebx, 3345
        jmp     .LBB9_4
.LBB9_6:
        mov     edi, 134
        mov     ebx, 5456
        jmp     .LBB9_4
.LBB9_7:
        mov     edi, 134
        mov     ebx, 9789
.LBB9_4:
        call    testPrintI32
        mov     edi, ebx
        add     rsp, 8
        pop     rbx
        pop     rbp
        jmp     testPrintI32

Additionally, I’d like to know which initialization style is better: the Zig style or the C style?

I’ve tested another style of Zig code that can generate assembly code in the C style.

export fn initC(cond: i32) void {
     const num1: i32, const num2: i32 = switch(cond) {
        1 => .{num1_with_cond_1, num2_with_cond_1},
        2 => .{num1_with_cond_2,num2_with_cond_2},
        100 => .{num1_with_cond_else, num2_with_cond_3},
        else => .{num1_with_cond_else, num2_with_cond_else}
     };
    testPrintI32(num1);
    testPrintI32(num2);
}

initC:
        push    rbp
        mov     rbp, rsp
        pop     rbp
        jmp     initB

That last initC code snippet just seems to hop into an existing initB function which probably has the actual switch-code (especially llvm’s optimizer passes can be sneaky like that - e.g. detect that two functions are largely the same and ‘deduplicate’ the code).

Tbh though your example is sort of a worst-case (very few branches with one very large gap). Usually llvm will try to create a combination of binary search and jump tables (or ideally only a single jump table - even if there are minor gaps)

Btw:

        push    rax
        cmp     edi, 100
        je      .LBB0_5
        cmp     edi, 2
        je      .LBB0_4
        mov     ebx, 1334
        cmp     edi, 1
        je      .LBB0_3
        mov     ebx, 9789
        jmp     .LBB0_3

This sort of ‘linear search’ is extremely rare, normally the LLVM optimizer is much smarter than that and at least generates a binary search. But I guess the number of branches is so low that the ‘binary search heuristic’ doesn’t kick in.

Also be aware of differences between the new x86 backend and LLVM, a poor switch-case handling in the new x86 backend was what killed debug performance in my emulators since a big switch-case with minor gaps was converted into a linear search, which reduced debug performance by at least 100x (since then this seems to have been fixed/improved).

PS:

In the assembly code for initA , there are five cmp instructions, and both the numbers 2 and 1 are compared twice.

…which kinda makes sense because there are two separate switch statements and assignments. Optimizers are smart but probably not smart enough to merge the two switches into one like you did manually in the initB function.

PPS: also generally, unless your switch statements have many more branches (>100 to >1000 or so) I wouldn’t worry too much about such details - and then the one thing that’s important is that the compiler is able to create a single jump table.

Thank you for your reply.

Regarding initC, I’ve tested the case where it doesn’t jump to initB, and the generated assembly code is essentially identical to that of initB. The subtle differences among these examples strike me as quite interesting—Zig offers multiple ways to perform initialization, and I’m curious to understand the distinctions between them more clearly.

initC:
        push    rbp
        mov     rbp, rsp
        push    rbx
        push    rax
        cmp     edi, 100
        je      .LBB7_6
        cmp     edi, 2
        je      .LBB7_5
        cmp     edi, 1
        jne     .LBB7_7
        mov     edi, 453
        mov     ebx, 1334
        jmp     .LBB7_4
.LBB7_5:
        mov     edi, 879
        mov     ebx, 3345
        jmp     .LBB7_4
.LBB7_6:
        mov     edi, 134
        mov     ebx, 5456
        jmp     .LBB7_4
.LBB7_7:
        mov     edi, 134
        mov     ebx, 9789
.LBB7_4:
        call    testPrintI32
        mov     edi, ebx
        add     rsp, 8
        pop     rbx
        pop     rbp
        jmp     testPrintI32

The initA example was adapted from my own Zig sample code with minimal changes. It has few branches but effectively illustrates the difference between the two coding styles. To a human reader, these two approaches seem logically equivalent, yet the compiler apparently interprets them differently—as you mentioned, it’s not quite “smart” enough.

On the other hand, I admit I don’t have much background in compiler theory. Do you have any book or article recommendations on this topic?

TL;DR: for your example I would definitely prefer the “switch-as-expression” form since it is more readable, the minimal amount of additional code generated shouldn’t really matter much in 99.9% of situations.

FWIW in my emulator project I use both versions equally, for instance here switch and if are used as expression:

const data: u8 = switch (addr) {
    MEMIO.RD.IN0 => ~self.in0,
    MEMIO.RD.IN1 => ~self.in1,
    MEMIO.RD.DSW1 => self.dsw1,
    else => if (sys == .Pengo and addr == MEMIO.RD.DSW2) self.dsw2 else 0xFF,
};

…but here I’m using the C-style form because assignment happens to different ‘targets’ (counter vs waveform):

switch (addr) {
    // zig fmt: off
    AUDIO.ADDR_V1_FC0 => { snd.voices[0].counter = setNibble(snd.voices[0].counter, data, 0); },
    AUDIO.ADDR_V1_FC1 => { snd.voices[0].counter = setNibble(snd.voices[0].counter, data, 1); },
    // ...
    AUDIO.ADDR_V1_WAVE => { snd.voices[0].waveform = @truncate(data); },
    // ...

TL;DR: “it depends” :wink: