Segmentation Fault trying to use dynlib for structs

I encountered another strange issue. I’m loading a plugin as a dynamic library. I want to use the de-facto standard of providing a struct with all of the plugin functions. I’m getting a segmentation fault when trying to access them.
I am not really that picky on how the plugin_info struct is structured. I would be willing to even pass strings and then look them up. I have tried all kinds of ways but they all seem to result in segfaults.

What am I doing wrong here?

Main Program

const std = @import("std");

/// C-calling-convention function pointer: takes two `i32` values, returns an `i32`.
const AddFn = *const fn (a: i32, b: i32) callconv(.C) i32;
/// C-layout table with a single `add` entry; `main` looks it up under the symbol name "plugin_info".
const PluginInfo = extern struct { add: AddFn };

/// Opens the plugin shared object, resolves its `plugin_info` table and
/// prints the result of calling `add(5, 3)` through that table.
/// Returns an error only if the library itself cannot be opened; a missing
/// symbol is reported on stderr and the program returns normally.
pub fn main() !void {
    var plugin_lib = try std.DynLib.open("zig-out/lib/libplugin.so");
    defer plugin_lib.close();

    const maybe_info = plugin_lib.lookup(*const PluginInfo, "plugin_info");
    if (maybe_info == null) {
        std.debug.print("Error: Could not find plugin_info\n", .{});
        return;
    }

    // Copy the table by value, then call through the copied pointer.
    const info: PluginInfo = maybe_info.?.*;
    const sum = info.add(5, 3);
    std.debug.print("{}\n", .{sum});
}

Plugin

const std = @import("std");

/// C-calling-convention function pointer: takes two `i32` values, returns an `i32`.
const AddFn = *const fn (a: i32, b: i32) callconv(.C) i32;
/// C-layout table with a single `add` entry; instantiated below as the exported `plugin_info`.
const PluginInfo = extern struct { add: AddFn };

/// Exported function table; its `add` entry refers to `pluginAdd`.
/// NOTE(review): the stored address presumably needs a load-time relocation
/// before the host can call through it — confirm how the host loads this library.
export const plugin_info: PluginInfo = .{
    .add = pluginAdd,
};

/// Returns `a + b`. Exported with the C calling convention.
export fn pluginAdd(a: i32, b: i32) callconv(.C) i32 {
    const sum = a + b;
    return sum;
}

The Error

Segmentation fault at address 0xd1d00
???:?:?: 0xd1d00 in ??? (???)
Unwind information for `???:0xd1d00` was not available, trace may be incomplete

/home/.../zig/lib/std/start.zig:616:37: 0x1035b6f in posixCallMainAndExit (xlatr)
            const result = root.main() catch |err| {
                                    ^
/home/.../zig/lib/std/start.zig:240:5: 0x103575d in _start (xlatr)
    asm volatile (switch (native_arch) {
    ^

I was able to lookup pluginAdd directly and verified that it works.

.add = &pluginAdd?

This results in the same error unfortunately.

plugin_info is a comptime constant. .add is being set to the offset of pluginAdd from the beginning of the plugin module (0xd1d00 in your stack trace). But when the app tries to execute it, it thinks this is an offset to its own module. Calling functions directly (or, equivalently, storing pointers to functions at compile time) only works for functions inside the same module. The proper way to call functions across modules is by querying the function pointer. In Windows, you do that with GetProcAddress, I don’t know the equivalent in Zig’s std.

1 Like

OK, that makes sense. Thanks for that explanation.

I’ve never seen this pattern of struct containing pointers. I think you’re overcomplicating this. The plugin needs to provide a function called add, so just name it add:

/// Returns `a + b`. Exported under the plain symbol name `add` with the C
/// calling convention.
export fn add(a: i32, b: i32) callconv(.C) i32 {
    const sum = a + b;
    return sum;
}

Now in your application, you just search for add using the module handle that you obtained when you loaded the plugin.

You are most likely right. I think that is what I will do.

I did find this and in the windows version it is using GetProcAddress instead.

    /// Resolves `name` with `dlsym` on this library's handle and returns the
    /// address cast to `T`, or `null` when `dlsym` returns null.
    pub fn lookup(self: *DlDynLib, comptime T: type, name: [:0]const u8) ?T {
        // dlsym (and other dl-functions) secretly take shadow parameter - return address on stack
        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66826
        const symbol = @call(.never_tail, std.c.dlsym, .{ self.handle, name.ptr }) orelse return null;
        return @as(T, @ptrCast(@alignCast(symbol)));
    }
1 Like

Yeah, I like this method and use it everywhere in plugins (in C).
So I’ve just tried to do the same in Zig (but using libdl directly).

api.zig

/// C-layout table of two function pointers (`add`, `sub`), each taking two
/// `i32` values and returning an `i32`.
/// NOTE(review): the pointer types name no calling convention; some compiler
/// versions reject that inside an extern struct ("extern function must specify
/// calling convention") — confirm against the Zig version in use.
pub const Api = extern struct {
    add: *const fn(i32,i32) i32,
    sub: *const fn(i32,i32) i32,
};

ari.zig

const Api = @import("api.zig").Api;

/// Returns the sum of `a` and `b`.
fn add(a: i32, b: i32) i32 {
    const sum = a + b;
    return sum;
}

/// Returns `a` minus `b`.
fn sub(a: i32, b: i32) i32 {
    const difference = a - b;
    return difference;
}

/// Exported function table: `add` and `sub` entries point at the two
/// file-local functions above. Exported under the symbol name `api`.
export const api: Api = .{
    .add = &add,
    .sub = &sub,
};

app.zig

const std = @import("std");
const log = std.debug.print;

const dll = @cImport({
    @cInclude("dlfcn.h");
});

const Api = @import("api.zig").Api;

/// Loads `./libari.so` with `dlopen`, resolves the exported `api` table with
/// `dlsym`, and calls `add` and `sub` through it.
/// Prints an error and returns early if the library or the symbol cannot be
/// resolved, instead of unwrapping a null pointer.
pub fn main() void {
    const lib = dll.dlopen("./libari.so", dll.RTLD_NOW);
    if (lib == null) {
        log("Error: could not load ./libari.so\n", .{});
        return;
    }
    defer _ = dll.dlclose(lib);
    log("lib = {*}\n", .{lib});

    // Resolve the symbol once and reuse the result for both the log line and the cast.
    const p = dll.dlsym(lib, "api") orelse {
        log("Error: could not find symbol 'api'\n", .{});
        return;
    };
    log("p = {*}\n", .{p});

    // The plugin exports the table as `const`, so only a read-only view is taken.
    const api: *const Api = @ptrCast(@alignCast(p));
    const a: i32 = 10;
    const b: i32 = 7;
    log("{} + {} = {}\n", .{ a, b, api.add(a, b) });
    log("{} - {} = {}\n", .{ a, b, api.sub(a, b) });
}

Compilation

zig build-lib -dynamic ari.zig -O ReleaseSmall
zig build-exe app.zig -ldl -O ReleaseSmall

Let’s get the ball rolling

dll$ ./app 
lib = *anyopaque@9d32c0
p = anyopaque@7f6c3da6c318
10 + 7 = 17
10 - 7 = 3

That’s it.

2 Likes

I looked at the source code for std.DynLib and, for most platforms, it just naively opens the file. That’s not the most correct way of loading a module, because it skips relocation, which is what would make these pointers become valid. libdl very likely does relocation.

1 Like
const std = @import("std");
const log = std.debug.print;
const Dll = std.DynLib;

const Api = @import("api.zig").Api;

/// Opens `./libari.so` through `std.DynLib`, resolves the exported `api`
/// table, and prints the table's address followed by the two function
/// pointers stored in it. The calls through the table are left commented out.
pub fn main() !void {
    var lib = try Dll.open("./libari.so");
    //    log("{any}\n", .{lib});
    defer lib.close();

    // `orelse unreachable` is the spelled-out form of the `.?` unwrap.
    const api = lib.lookup(*Api, "api") orelse unreachable;
    log("api = {*}\n", .{api});

    const add_ptr = api.add;
    log("api.add = {*}\n", .{add_ptr});
    const sub_ptr = api.sub;
    log("api.sub = {*}\n", .{sub_ptr});

    //    const a: i32 = 10;
    //    const b: i32 = 7;
    //    log("{} + {} = {}\n", .{a, b, api.add(a, b)});
    //    log("{} - {} = {}\n", .{a, b, api.sub(a, b)});
}
$ ./app2 
api = api.Api@7fb1fb6e6390
api.add = fn(i32, i32) i32@0           <<<<<<<<<<<<<<<<
api.sub = fn(i32, i32) i32@0           <<<<<<<<<<<<<<<<

Does this mean that DynLib just does not work?..

1 Like

It looks like it. If the dynamic library doesn’t need relocation, it should work, but it would be quite fragile. Maybe this should warrant opening an issue.

… in the meantime I’ve discovered for myself (out of curiosity) that a DLL produced by Zig is quite usable from C:

a program

#include <stdio.h>
#include <dlfcn.h>

struct api {
    int (*add)(int, int);
    int (*sub)(int, int);
};

/*
 * Loads ./libari.so, resolves its exported "api" table and calls add/sub
 * through it.
 *
 * Returns 0 on success, 1 if the library cannot be loaded or the "api"
 * symbol cannot be resolved (a message is written to stderr in both cases).
 */
int main(void) {
    void *lib = dlopen("./libari.so", RTLD_NOW);
    if (lib == NULL) {
        /* dlerror() describes the most recent dl* failure. */
        fprintf(stderr, "dlopen: %s\n", dlerror());
        return 1;
    }

    struct api *api = dlsym(lib, "api");
    if (api == NULL) {
        fprintf(stderr, "dlsym: symbol 'api' not found\n");
        dlclose(lib);
        return 1;
    }

    int a = 10;
    int b = 7;
    printf("%d + %d = %d\n", a, b, api->add(a, b));
    printf("%d - %d = %d\n", a, b, api->sub(a, b));

    dlclose(lib);
    return 0;
}

and its output:

$ ./a.out 
10 + 7 = 17
10 - 7 = 3
2 Likes

Interesting … thanks for your work on this.

It looks like your example works on Windows but not on Linux.

Also, the Windows version puts the .DLL in the bin directory with a LIB file in the lib.

I ended up testing with

const builtin = @import("builtin");
const native_os = builtin.os.tag;
//...
    const file_name = if (native_os == .windows) "zig-out/bin/testdl.dll" else "zig-out/lib/libtestdl.so";
    var lib = try Dll.open(file_name);

Looks like there is a bug here in DynLib.

Do you mean the second one, with Zig’s DynLib?.. (dlopen used in the first one is a POSIX thing)

Yes, I tested the Zig DynLib.

I used zig 0.11.0, maybe in more recent versions it’s working.

My versions:

Linux: 0.14.0-dev.839+a931bfada
Windows: 0.14.0-dev.121+ab4c461b7

$ /opt/zig-0.12/zig build-exe app2.zig 
api.zig:3:10: error: extern structs cannot contain fields of type '*const fn (i32, i32) i32'
    add: *const fn(i32,i32) i32,
         ^~~~~~~~~~~~~~~~~~~~~~
api.zig:3:10: note: extern function must specify calling convention

okay…

/// C-layout table of four function pointers (`add`, `sub`, `mul`, `div`).
/// Each entry uses the C calling convention, takes two `i32` values and
/// returns an `i32`.
pub const Api = extern struct {
    add: *const fn(i32,i32) callconv(.C) i32,
    sub: *const fn(i32,i32) callconv(.C) i32,
    mul: *const fn(i32,i32) callconv(.C) i32,
    div: *const fn(i32,i32) callconv(.C) i32,
};
$ ./app2
api = api.Api@7f1dac65f390
api.add = fn (i32, i32) callconv(.C) i32@0
api.sub = fn (i32, i32) callconv(.C) i32@0
Segmentation fault at address 0x0

Same picture - pointers inside the structure are zero.

I’m going to search for a bug report already existing … if not we can add one.

1 Like

Tried 0.13.0 and master 0.14.0-dev.872+a60810b5a:

api.add = fn (i32, i32) callconv(.C) i32@0
api.sub = fn (i32, i32) callconv(.C) i32@0
Segmentation fault at address 0x0

So it seems it’s really a bug.