Bare metal aligned stack var

Hello!

I need to declare an aligned chunk of 64 u8 values. Is the method below the correct approach? Are there any drawbacks or alternatives?

var data align(4) = @as([64]u8, undefined);

Welcome @ssilnicki-dev to ziggit :slight_smile:

align keyword must follow a variable declaration.

My preferred syntax for alignment specification on variables is:

var data: [64]u8 align(4) = undefined;
1 Like

That works, but it might be clearer to do:

var data: [64]u8 align(4) = undefined;
2 Likes

oh, so obvious :slight_smile: thank you! I’ve been struggling with this sample in docs as a starting point…:

const as_slice: []align(4) u8 = as_pointer_to_array;

… not the end of the story though,

What I have noticed is that using the proposed variant

export fn main() u8 {
    var data: [64]u8 align(4) = undefined;
    _ = &data;
    return 0;
}

pulls a huge amount of runtime into the binary (140k, which is too much for my target):

ELF binary enrty point @ 0x2FFC2500, binary length w/0 STM32 header = 143144, binary checksum = 0xED7B96.

For an unknown reason main calls __aeabi_memset4, which then pulls in the rest of the runtime:

2ffc2604 <main>:
2ffc2604:       e92d4800        push    {fp, lr}
2ffc2608:       e1a0b00d        mov     fp, sp
2ffc260c:       e24dd040        sub     sp, sp, #64     ; 0x40
2ffc2610:       e1a0000d        mov     r0, sp
2ffc2614:       e3a01040        mov     r1, #64 ; 0x40
2ffc2618:       e3a020aa        mov     r2, #170        ; 0xaa
2ffc261c:       eb008420        bl      2ffe36a4 <__aeabi_memset4>
2ffc2620:       e3a00000        mov     r0, #0
2ffc2624:       e1a0d00b        mov     sp, fp
2ffc2628:       e8bd8800        pop     {fp, pc}

while my initial proposal does not:

export fn main() u8 {
    var data align(4) = @as([64]u8, undefined);
    _ = &data;
    return 0;
}

ELF binary enrty point @ 0x2FFC2500, binary length w/0 STM32 header = 556, binary checksum = 0x9C7C.

and main itself looks neat and reasonable:

2ffc2604 <main>:
2ffc2604:       e24dd040        sub     sp, sp, #64     ; 0x40
2ffc2608:       e3a00000        mov     r0, #0
2ffc260c:       e28dd040        add     sp, sp, #64     ; 0x40
2ffc2610:       e12fff1e        bx      lr

Maybe I am missing some build options? Any clues?

UPD: I use recent zig version @ e5d900268a if it matters

Are you compiling in debug mode? In debug, setting something to undefined will call memset to fill it with 0xAA. I don’t know why it’s not doing it in the other case, though.

2 Likes

What target are you building for?

I build for .ReleaseSmall as my build.zig suggests

1 Like
    const optimize = b.standardOptimizeOption(.{ .preferred_optimize_mode = .ReleaseSmall });
    const standard_target = b.standardTargetOptions(.{});

To be clear, your optimization option is ReleaseSmall but your target is just default. Let me ask this a different way - what are you trying to run this on? It sounds like you have limited space to work with.

There are actually two targets. The first (built with the standard_target mentioned above) is a Linux desktop CLI that generates a special header for the second target's binary and then combines that header with the binary. The second target is a kind of first-stage bootloader for an ARMv7 CPU (stm32mp157 in my case). It can be loaded onto the platform only with such a header, which, among other things, contains some parameters that are available only after the bootloader has been compiled. So I decided to combine the two applications in one build. The limit is about 256k for the bootloader to fit in SYSRAM, so it is too painful to spend 140k on unnecessary runtime. The loader itself boots from SD/eMMC, so it could be any size, but the 256k of SYSRAM is a barrier…

UPD: the bootloader is built with the following setup:

    const armv7a_features = Target.arm.Feature;
    var enabled_features = Feature.Set.empty;
    enabled_features.addFeature(@intFromEnum(armv7a_features.v7a));
    enabled_features.addFeature(@intFromEnum(armv7a_features.vldn_align));
    enabled_features.addFeature(@intFromEnum(armv7a_features.neon));
    enabled_features.addFeature(@intFromEnum(armv7a_features.vfp3d16));

    const armv7a_target = CrossTarget{
        .cpu_arch = .arm,
        .os_tag = .freestanding,
        .cpu_model = .{
            .explicit = &.{
                .name = "cortex_a7",
                .llvm_name = "cortex-a7",
                .features = .{ .ints = .{ 0, 0, 0, 0, 0 } }, // empty
            },
        },
        .abi = .eabihf,
        .cpu_features_add = enabled_features,
    };
    const resolver_target = b.resolveTargetQuery(armv7a_target);

    const fsbl_elf = b.addExecutable(.{
        .name = "fsbl",
        .root_source_file = .{ .path = "src/main.zig" },
        .target = resolver_target,
        .optimize = optimize,
    });

Although my initial approach with the syntactic sugar around the aligned slice looked promising, using either approach pulls in 160k+ of runtime anyway :frowning:
It is even worse that the generated code, which is intended for bare metal (freestanding), tries to print some sort of debugging output in circumstances where no console may exist…

export fn main() void {
    var data align(4) = @as([64]u8, undefined);
    _ = getBlockSize(data[4..]);
}

fn getBlockSize(data: []align(4) u8) usize {
    return data.len;
}

above compiles into

2ffc2604 <main>:
2ffc2604:       e92d4800        push    {fp, lr}
2ffc2608:       e1a0b00d        mov     fp, sp
2ffc260c:       e24dd048        sub     sp, sp, #72     ; 0x48
2ffc2610:       e28d0008        add     r0, sp, #8
2ffc2614:       e2800004        add     r0, r0, #4
2ffc2618:       e58d0004        str     r0, [sp, #4]
2ffc261c:       e3a00001        mov     r0, #1
2ffc2620:       e3500000        cmp     r0, #0
2ffc2624:       1a000005        bne     2ffc2640 <main+0x3c>
2ffc2628:       ea000005        b       2ffc2644 <main+0x40>
2ffc262c:       e59d0004        ldr     r0, [sp, #4]
2ffc2630:       e3a0103c        mov     r1, #60 ; 0x3c
2ffc2634:       eb000015        bl      2ffc2690 <main.getBlockSize>
2ffc2638:       e1a0d00b        mov     sp, fp
2ffc263c:       e8bd8800        pop     {fp, pc}
2ffc2640:       eafffff9        b       2ffc262c <main+0x28>
2ffc2644:       e3a01040        mov     r1, #64 ; 0x40
2ffc2648:       e1a00001        mov     r0, r1
2ffc264c:       e1a0e00f        mov     lr, pc
2ffc2650:       eaffffff        b       2ffc2654 <builtin.panicOutOfBounds>
pub fn panicOutOfBounds(index: usize, len: usize) noreturn {
    @setCold(true);
    std.debug.panicExtra(null, @returnAddress(), "index out of bounds: index {d}, len {d}", .{ index, len });
}

I am definitely missing something…

Maybe has nothing to do with this, but have you tried setting strip to true in your exe section of build.zig?

const fsbl_elf = b.addExecutable(.{
    .name = "fsbl",
    .root_source_file = .{ .path = "src/main.zig" },
    .target = resolver_target,
    .optimize = optimize,
    .strip = true,
});
1 Like

Just tried, but it does not help.
Anyway, I noticed that

        .optimize = .ReleaseSmall,

instead of

    const optimize = b.standardOptimizeOption(.{ .preferred_optimize_mode = .ReleaseSmall });
....
        .optimize = optimize,

does the trick. It seems that my code was ALWAYS compiled in .Debug :confused:

Indeed, reading zig compiler source code sometimes helps:

pub fn standardOptimizeOption(b: *Build, options: StandardOptimizeOptionOptions) std.builtin.OptimizeMode {
    if (options.preferred_optimize_mode) |mode| {
        if (b.option(bool, "release", "optimize for end users") orelse (b.release_mode != .off)) {
            return mode;
        } else {
            return .Debug;
        }
    }
....

It looks like .Debug is the default and I wasn’t aware of it.

UPD: preferred_optimize_mode expects -Drelease=true in order to apply the provided mode. :heavy_check_mark:

3 Likes