Strange result in WebAssembly when optimize = Debug

I’ve been trying to figure out why my WebAssembly threads are hitting unreachable when they terminate. For this purpose I added some debug output to wasi_thread_start():

    /// Debug helper: writes `address` in decimal via `std.debug.print`,
    /// one "address = N" line per call.
    fn print_address(address: usize) void {
        const fmt_args = .{address};
        std.debug.print("address = {d}\n", fmt_args);
    }

    /// Thread entry point, instrumented with temporary debug output around
    /// the user callback.
    ///
    /// `tid` is stored into `WasiThreadImpl.tls_thread_id`. `arg` supplies the
    /// thread's memory block, the stack and TLS offsets into it, and the
    /// callback to run.
    ///
    /// NOTE(review): the stack pointer is only set by the first call in the
    /// body, so anything the compiler emits ahead of that call (such as a
    /// function prologue) runs with whatever stack pointer was in effect on
    /// entry.
    fn wasi_thread_start(tid: i32, arg: *Instance) callconv(.c) void {
        comptime assert(!builtin.single_threaded);
        // Set the stack pointer and TLS from offsets into this thread's memory block.
        __set_stack_pointer(arg.thread.memory.ptr + arg.stack_offset);
        __wasm_init_tls(arg.thread.memory.ptr + arg.tls_offset);
        // Record the thread id passed in by the caller.
        @atomicStore(u32, &WasiThreadImpl.tls_thread_id, @intCast(tid), .seq_cst);

        // *** debug output: addresses of `arg` and `arg.thread` before the callback
        // (the `nop`s presumably mark this section in the generated code; TODO confirm)
        asm volatile ("nop");
        print_address(@intFromPtr(arg));
        print_address(@intFromPtr(&arg.thread));
        asm volatile ("nop");
        // ***

        // Finished bootstrapping, call user's procedure.
        arg.call_back(arg.raw_ptr);

        // *** debug output: the same two addresses again, after the callback returned
        asm volatile ("nop");
        print_address(@intFromPtr(arg));
        print_address(@intFromPtr(&arg.thread));
        asm volatile ("nop");
        // ***

        // Atomically mark the thread as completed and branch on the state it
        // was in before the swap.
        switch (arg.thread.state.swap(.completed, .seq_cst)) {
            .running => {
                // reset the Thread ID (atomic store of 0 to `arg.thread.tid.raw`)
                asm volatile (
                    \\ local.get %[ptr]
                    \\ i32.const 0
                    \\ i32.atomic.store 0
                    :
                    : [ptr] "r" (&arg.thread.tid.raw),
                );

                // Wake the main thread listening to this thread
                // (notify at most one waiter on the same address; the result is dropped)
                asm volatile (
                    \\ local.get %[ptr]
                    \\ i32.const 1 # waiters
                    \\ memory.atomic.notify 0
                    \\ drop # no need to know the waiters
                    :
                    : [ptr] "r" (&arg.thread.tid.raw),
                );
            },
            // Finding the state already `.completed` before our own swap is
            // treated as impossible.
            .completed => unreachable,
            .detached => {
                // restore the original stack pointer so we can free the memory
                // without having to worry about freeing the stack
                __set_stack_pointer(arg.original_stack_pointer);
                // Ensure a copy so we don't free the allocator reference itself
                var allocator = arg.thread.allocator;
                allocator.free(arg.thread.memory);
            },
        }
    }

And here’s what I’m getting:

address = 1638412
address = 1638412
address = 1638412
address = 261980

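It’s quite inexplicable: the fourth line should match the other three, yet &arg.thread evaluates to a different address after the callback returns even though arg itself is unchanged. So it seems the switch statement at the bottom is reaching unreachable because it’s presumably reading arg.thread.state through that bogus address and getting a value that isn’t among the possible ones.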

This only happens when optimize is Debug. When it’s ReleaseSafe, the output is as one would expect:

address = 1638412
address = 1638412
address = 1638412
address = 1638412

Okay, I managed to fix the issue. Somewhere else in my code I replaced a call to std.atomic.Value(T).store() with @atomicStore() and the problem went away:

        // ptr.value.store(value, .release);
        @atomicStore(u32, &ptr.value.raw, value, .release);

No idea why this would fix it. Somehow the inline call isn’t getting expanded correctly and is corrupting the stack.

It turns out this has nothing to do with std.atomic.Value. The actual problem is that, in Debug builds, a stack frame gets set up before wasi_thread_start() has a chance to set the thread’s stack pointer. The solution is to use a naked function that sets the stack pointer first and then calls the real implementation:

    // Export the two thread entry points, but only in multi-threaded builds.
    comptime {
        if (!builtin.single_threaded) {
            // Entry point, exported under the name "wasi_thread_start".
            @export(&wasi_thread_start, .{ .name = "wasi_thread_start" });
            // Exported with hidden visibility under the symbol name that the
            // naked entry point's inline assembly calls.
            @export(&wasi_thread_start_impl, .{ .name = "wasi_thread_start_impl", .visibility = .hidden });
        }
    }

    /// Called by the host environment after thread creation.
    ///
    /// Declared naked so that the stack pointer is set before any stack frame
    /// is created. The rest of the start-up work is done by
    /// `wasi_thread_start_impl`, which receives both parameters unchanged.
    fn wasi_thread_start(_: i32, arg: *Instance) callconv(.naked) void {
        comptime assert(!builtin.single_threaded);
        // Set the stack pointer from this thread's memory block first.
        __set_stack_pointer(arg.thread.memory.ptr + arg.stack_offset);
        // Push the two parameters (wasm locals 0 and 1), call the
        // implementation by its exported symbol name, then return.
        asm volatile (
            \\ local.get 0
            \\ local.get 1
            \\ call wasi_thread_start_impl
            \\ return
        );
    }

    /// Remainder of thread start-up, called from the naked `wasi_thread_start`
    /// after that function has set the stack pointer. Takes the same
    /// `tid` and `arg` parameters.
    fn wasi_thread_start_impl(tid: i32, arg: *Instance) callconv(.c) void {
        // Presumably initializes thread-local storage at `tls_offset` inside
        // this thread's memory block (TODO confirm).
        __wasm_init_tls(arg.thread.memory.ptr + arg.tls_offset);
        // ... (rest of the body elided in the post)
    }
1 Like