Trouble understanding how Zig is faster than C and C++

Ok so this is a bit of a silly subject, and the code is even sillier, but there are some things that I fail to understand. To give you some context: yesterday I was on Instagram and I saw a reel of a guy drag-racing Python/C++/C# with a simple program that just prints the numbers from 0 to 10M. C# was ahead and I thought there was no way; I didn't bother to try to reproduce it, but I did my own drag-racing attempt using C/C++/Zig. This is a very dumb experiment: I spawn 32 threads, and they simply increment a global i until it reaches a limit.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static const size_t limit = 10000000;
static size_t       i     = 0;
pthread_mutex_t     lock;

static void *work (void *arg) {
        assert (arg != NULL);
        const size_t local_limit = limit - *((size_t *) arg);
        free (arg);
        while (1) {
                pthread_mutex_lock (&lock);
                if (i <= local_limit) {
                        printf ("%lu\n", i++);
                } else {
                        pthread_mutex_unlock (&lock);
                        return (NULL);
                }
                pthread_mutex_unlock (&lock);
        }
        return NULL;
}

int main (void) {
        pthread_t pids[32];

        char buff[4096];
        setvbuf (stdout, buff, _IOFBF, sizeof (buff));
        pthread_mutex_init (&lock, NULL);
        for (size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
                size_t *ptr = malloc (sizeof (size_t));
                *ptr        = j;
                pthread_create (&pids[j], NULL, work, ptr);
        }

        for (size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
                pthread_join (pids[j], NULL);
        }

        return 0;
}

compile command

 clang -O3 -march=native -mtune=native -flto -fomit-frame-pointer -DBUFSIZ=4096 \
  -funroll-loops -fstrict-aliasing -fmerge-all-constants \
  -funsafe-math-optimizations -ffast-math -finline-functions \
  -fvectorize -fslp-vectorize -fno-plt -fno-semantic-interposition \
  -fno-rtti -fvisibility=hidden -fPIE -fPIC -Wl,-O3 -Wl,--as-needed \
  -Wl,--gc-sections -flto -s -static -pthread -o main main.c \
  -static-libgcc

this is the C++ version

#include <iostream>
#include <pthread.h>

static const std::size_t limit = 10000000;
static std::size_t       i     = 0;
pthread_mutex_t          lock;

static void *work (void *arg) noexcept {
        const std::size_t *value       = static_cast<std::size_t *> (arg);
        const std::size_t  local_limit = limit - *value;
        delete value;
        while (true) {
                pthread_mutex_lock (&lock);
                if (i <= local_limit)
                        std::cout << i++ << '\n';
                else {
                        pthread_mutex_unlock (&lock);
                        return NULL;
                }
                pthread_mutex_unlock (&lock);
        }
        return NULL;
}

int main (void) {
        char buffer[4096];
        pthread_t pids[32];

        std::cout.rdbuf()->pubsetbuf(buffer, sizeof(buffer));
        pthread_mutex_init (&lock, NULL);
        for (std::size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
                std::size_t *ptr = new std::size_t (j);
                pthread_create (&pids[j], NULL, work, ptr);
        }

        for (std::size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
                pthread_join (pids[j], NULL);
        }

        return 0;
}

and this is the compile command

 g++ -O3 -march=native -mtune=native -flto -fomit-frame-pointer -fno-exceptions \
  -funroll-loops -fstrict-aliasing -fmerge-all-constants \
  -funsafe-math-optimizations -ffast-math -finline-functions \
  -fno-plt -fno-semantic-interposition \
  -fno-rtti -fvisibility=hidden -fPIE -fPIC -Wl,-O3 -Wl,--as-needed \
  -Wl,--gc-sections -flto -s -static -pthread -static-libgcc -o main main.cpp

and finally this is the Zig code

const std = @import("std");
const thread = std.Thread;

const out = std.io.getStdOut();
var buf = std.io.bufferedWriter(out.writer());
const writer = buf.writer();
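// note: every thread's copy of writer points at this one global buffer,
// and it is only written to while mutex (declared below) is held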

const args = struct {
    allocator: std.mem.Allocator,
    value: *usize,
    w: @TypeOf(writer),
};

const limit = 10_000_000;
var alloc_mutex: thread.Mutex = .{};
var mutex: thread.Mutex = .{};
var i: usize = 0;

fn work(arg: args) void {
    const local_limit = limit - arg.value.*;
    arg.allocator.destroy(arg.value);
    while (true) {
        thread.Mutex.lock(&mutex);
        if (i <= local_limit) {
            arg.w.print("{d}\n", .{i}) catch unreachable;
            i += 1;
        } else {
            thread.Mutex.unlock(&mutex);
            return;
        }
        thread.Mutex.unlock(&mutex);
    }
}

pub fn main() !void {
    const N: usize = 32;
    var tid: [N]thread = undefined;
    const page_allocator = std.heap.page_allocator;
    var arena = std.heap.ArenaAllocator.init(page_allocator);
    defer arena.deinit();
    var thread_safe_allocator: std.heap.ThreadSafeAllocator = .{ .child_allocator = arena.allocator() };
    const allocator = thread_safe_allocator.allocator();
    defer buf.flush() catch unreachable;

    for (0..N) |j| {
        const arg: args = .{
            .allocator = allocator,
            .value = allocator.create(usize) catch unreachable,
            .w = writer,
        };
        arg.value.* = j;
        tid[j] = thread.spawn(.{ .stack_size = 16 * 1024 * 1024, .allocator = allocator }, work, .{arg}) catch unreachable;
    }

    for (tid) |t| {
        thread.join(t);
    }
}

and in the build.zig

    const exe = b.addExecutable(.{
        .name = "zig",
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = .ReleaseFast,
        .strip = true,
        .single_threaded = false,
        .error_tracing = false,
        .sanitize_thread = false,
        .omit_frame_pointer = true,
        .use_llvm = true,
        .use_lld = true,
    });

I can’t really make sense of the results I’m getting using hyperfine

❯ hyperfine -r 10 ./c/main ./cpp/main ./zig/zig-out/bin/zig
Benchmark 1: ./c/main
  Time (mean ± σ):      3.085 s ±  0.015 s    [User: 1.936 s, System: 43.669 s]
  Range (min … max):    3.061 s …  3.111 s    10 runs

Benchmark 2: ./cpp/main
  Time (mean ± σ):      3.103 s ±  0.007 s    [User: 2.015 s, System: 43.875 s]
  Range (min … max):    3.091 s …  3.111 s    10 runs

Benchmark 3: ./zig/zig-out/bin/zig
  Time (mean ± σ):     773.8 ms ±  38.4 ms    [User: 647.0 ms, System: 11048.7 ms]
  Range (min … max):   713.6 ms … 805.0 ms    10 runs

Summary
  ./zig/zig-out/bin/zig ran
    3.99 ± 0.20 times faster than ./c/main
    4.01 ± 0.20 times faster than ./cpp/main

Again, I know this is a very stupid experiment that doesn't really tell anything about anything, and I'm positive I'm missing some expert-level knowledge in C/C++ to achieve better performance, but at least nothing obvious jumps out at me. What is it that I'm doing in Zig that makes it so much faster than the other two?


Running each through strace --summary-only might give you some clues.


strace --summary-only --follow-forks ./main (C version)

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 98.14  326.750700         463    704298    121291 futex
  1.85    6.167854         320     19260           write
  0.00    0.009065         283        32           clone3
  0.00    0.007494          58       129           rt_sigprocmask
  0.00    0.004703         146        32           madvise
  0.00    0.004220          64        65           mprotect
  0.00    0.003386          52        64           mmap
  0.00    0.001490          29        50           munmap
  0.00    0.001444          43        33           rseq
  0.00    0.001438          43        33           set_robust_list
  0.00    0.000403         403         1           close
  0.00    0.000109         109         1           openat
  0.00    0.000038          38         1           read
  0.00    0.000000           0         5           brk
  0.00    0.000000           0         1           rt_sigaction
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0         1           readlinkat
  0.00    0.000000           0         1           prlimit64
  0.00    0.000000           0         1           getrandom
------ ----------- ----------- --------- --------- ----------------
100.00  332.952344         459    724011    121291 total

strace --summary-only --follow-forks ./main (C++ version)

% time      seconds  usecs/call     calls    errors syscall
------ ------------ ----------- --------- --------- ----------------
 99.00 10159.580616         473  21479043   1499335 futex
  1.00   102.267683          10  10000001           write
  0.00     0.002646          20       129           rt_sigprocmask
  0.00     0.002441          76        32           clone3
  0.00     0.001857          29        64           mmap
  0.00     0.001842          28        65           mprotect
  0.00     0.000996          31        32           madvise
  0.00     0.000893          19        46           munmap
  0.00     0.000717          21        33           set_robust_list
  0.00     0.000632          19        33           rseq
  0.00     0.000247         247         1           execve
  0.00     0.000039          39         1           openat
  0.00     0.000031          31         1           close
  0.00     0.000029           5         5           brk
  0.00     0.000027          27         1           fstat
  0.00     0.000022          22         1           read
  0.00     0.000021          21         1           readlinkat
  0.00     0.000006           6         1           arch_prctl
  0.00     0.000006           6         1           prlimit64
  0.00     0.000005           5         1           rt_sigaction
  0.00     0.000005           5         1           set_tid_address
  0.00     0.000005           5         1           getrandom
------ ------------ ----------- --------- --------- ----------------
100.00 10261.860766         325  31479494   1499335 total

strace --summary-only --follow-forks ./zig (Zig version)

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 97.50  185.170492        1462    126636     18153 futex
  2.49    4.735216         245     19261           write
  0.01    0.010359         323        32           clone
  0.00    0.000735          22        32           mprotect
  0.00    0.000731          22        33           mmap
  0.00    0.000537          16        33           munmap
  0.00    0.000000           0         1           rt_sigaction
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           prlimit64
------ ----------- ----------- --------- --------- ----------------
100.00  189.918070        1300    146031     18153 total

Indeed, I should have started here. Still, I'm fairly sure my translation is quite faithful, or are there things I'm not doing correctly in C/C++? I mean, they all spawn 32 threads, they all increment i and print to stdout until the limit, they all use a mutex with lock/unlock, and they all use buffered IO, so I'm really not sure where I can scrape out the last bits of performance to make it a fair comparison.

EDIT:
I've re-run strace with the suggestions from @squeek502.

I believe that the futex time is the wait time for the join call.
A profiler is needed.
Also, running with poop instead of hyperfine might give us a hint.

Unfortunately I can't use poop on my system, because the perf_event_open call fails on my Ubuntu 24.

I’m not sure strace is following the threads. Maybe try adding --follow-forks?

Hmm, it makes even less sense now: the C and Zig versions make about the same number of write syscalls, but the C++ version is doing way too many of them, and running it under strace made it take forever.

I got the poop benchmark working; here's the result 🙂

Benchmark 1 (87 runs): ./zig/zig-out/bin/zig
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           577ms ± 3.34ms     563ms …  583ms          3 ( 3%)        0%
  peak_rss            262KB ±    0       262KB …  262KB          0 ( 0%)        0%
  cpu_cycles         1.91G  ± 11.4M     1.85G  … 1.94G          10 (11%)        0%
  instructions       1.34G  ±  293K     1.34G  … 1.34G           4 ( 5%)        0%
  cache_references   33.2M  ±  456K     31.4M  … 34.2M           6 ( 7%)        0%
  cache_misses       12.7M  ±  144K     12.2M  … 13.2M          11 (13%)        0%
  branch_misses      7.40M  ± 55.9K     7.09M  … 7.49M           4 ( 5%)        0%
Benchmark 2 (18 runs): cpp/main
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.80s  ± 13.7ms    2.79s  … 2.85s           2 (11%)        💩+385.0% ±  0.6%
  peak_rss           3.68MB ± 76.4KB    3.54MB … 3.80MB          0 ( 0%)        💩+1305.6% ±  6.1%
  cpu_cycles         8.11G  ± 44.5M     8.08G  … 8.23G           2 (11%)        💩+325.1% ±  0.6%
  instructions       8.33G  ± 1.12M     8.33G  … 8.33G           1 ( 6%)        💩+522.9% ±  0.0%
  cache_references    399M  ± 3.42M      394M  …  407M           0 ( 0%)        💩+1102.7% ±  2.2%
  cache_misses       47.4M  ±  618K     46.4M  … 48.5M           0 ( 0%)        💩+271.9% ±  1.1%
  branch_misses      37.4M  ±  273K     37.0M  … 38.0M           0 ( 0%)        💩+405.4% ±  0.8%
Benchmark 3 (19 runs): ./c/main
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.69s  ± 15.1ms    2.66s  … 2.74s           2 (11%)        💩+365.6% ±  0.6%
  peak_rss            786KB ±    0       786KB …  786KB          0 ( 0%)        💩+200.0% ±  0.0%
  cpu_cycles         7.73G  ± 53.3M     7.67G  … 7.88G           2 (11%)        💩+305.3% ±  0.6%
  instructions       8.22G  ±  820K     8.21G  … 8.22G           1 ( 5%)        💩+514.5% ±  0.0%
  cache_references    382M  ± 4.35M      377M  …  395M           1 ( 5%)        💩+1051.8% ±  2.8%
  cache_misses       53.6M  ±  855K     52.6M  … 55.7M           0 ( 0%)        💩+320.9% ±  1.5%
  branch_misses      37.0M  ±  233K     36.7M  … 37.8M           2 (11%)        💩+400.5% ±  0.7%

From those results I'm even more confused: how are the C/C++ versions getting this many branch_misses? Why do they execute so many more instructions (8.2G vs 1.3G)? And the cache_misses are just ridiculous.

I must be doing something wrong, because this doesn't make sense. I even switched the std::cout for the same C printf/setvbuf config, but I'm still not sure why there's such a big difference.

EDIT: my best guess is that it's about how Zig's Thread.Mutex works, with the tryLock/lockSlow split, which apparently doesn't cause as many cache issues and lets the uncontended path return immediately. Not sure if this is really why it's faster, but it seems like a reasonable explanation.

I suggest playing around with the payload a bit and seeing what happens. Multi-threading benchmarks tend to be very sensitive to what happens inside the lock.

Also double check that the output stream of all three is in fact printing the numbers in order, if you haven’t done that.

You can do this with Awk fairly easily:

#!/usr/bin/awk -f

BEGIN { prev = -1; result = "in order" }

{
    curr = $1
    if (curr != prev + 1) {
        result = "out of order"
    }
    prev = curr
}

END {
    print result
}

I haven’t actually tested that script but it should be fairly close.

But yeah, try something which doesn’t rely on buffered printing and see if you still get the same performance gap.
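
For example, to take buffered printing out of the picture entirely, the payload could just bump the counter and print nothing until the end. A rough, untested sketch (keeping the same 32 threads and 10M limit as the original):

const std = @import("std");

var mutex: std.Thread.Mutex = .{};
var i: usize = 0;
const limit = 10_000_000;

fn work() void {
    while (true) {
        mutex.lock();
        defer mutex.unlock();
        if (i >= limit) return;
        i += 1;
    }
}

pub fn main() !void {
    var threads: [32]std.Thread = undefined;
    for (&threads) |*t| t.* = try std.Thread.spawn(.{}, work, .{});
    for (threads) |t| t.join();
    // a single print at the end, so buffered IO is no longer part of the measurement
    std.debug.print("final i = {d}\n", .{i});
}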


I am wondering whether there might be some code somewhere in Zig that detects whether there is actually a connected tty/console, and if not just ends up discarding the output instead of transferring it to the terminal?

Maybe the slow part is pushing everything out to stdout, and Zig has some shortcut for that if you don't actually consume the output?

So maybe you would have to save the output to a file instead, to ensure the tests are equal?


Maybe looking at the generated assembly for work could help.

Try making the Zig version use libc and see if that makes a large difference.
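
For what it's worth, linking libc should just be a one-line change in build.zig (assuming the exe step from the earlier snippet):

// after the addExecutable(...) call in the build.zig shown above
exe.linkLibC();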

I decided to write the output to a file, sort it with my editor's built-in sort, and diff it against a second output file. The C and Zig outputs are in order, but the C++ one isn't, so I don't know where I messed up.


This is the difference between Zig baseline and Zig libc

❯ sudo /home/pollivie/local/bin/./poop ./zig_baseline ./zig_libc
Benchmark 1 (9 runs): ./zig_baseline
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           592ms ± 6.33ms     579ms …  597ms          1 (11%)        0%
  peak_rss            262KB ±    0       262KB …  262KB          0 ( 0%)        0%
  cpu_cycles         1.98G  ± 18.9M     1.94G  … 2.00G           1 (11%)        0%
  instructions       1.36G  ±  435K     1.36G  … 1.36G           2 (22%)        0%
  cache_references   36.7M  ±  604K     35.7M  … 37.6M           0 ( 0%)        0%
  cache_misses       13.1M  ±  194K     12.7M  … 13.5M           2 (22%)        0%
  branch_misses      7.72M  ± 83.8K     7.53M  … 7.79M           1 (11%)        0%
Benchmark 2 (8 runs): ./zig_libc
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           632ms ± 6.05ms     618ms …  636ms          2 (25%)        💩+  6.7% ±  1.1%
  peak_rss           1.70MB ± 70.1KB    1.57MB … 1.84MB          2 (25%)        💩+550.0% ± 18.9%
  cpu_cycles         1.97G  ± 4.63M     1.96G  … 1.98G           0 ( 0%)          -  0.6% ±  0.7%
  instructions       1.04G  ±  103K     1.04G  … 1.04G           1 (13%)        ⚡- 23.1% ±  0.0%
  cache_references   37.2M  ±  284K     36.9M  … 37.7M           0 ( 0%)          +  1.3% ±  1.4%
  cache_misses       13.5M  ± 40.9K     13.4M  … 13.5M           0 ( 0%)        💩+  2.6% ±  1.1%
  branch_misses      7.69M  ± 50.6K     7.63M  … 7.78M           0 ( 0%)          -  0.4% ±  0.9%

I've only tried with the C and Zig versions: nothing really weirds me out, although I'm not an assembly specialist.
godbolt C
godbolt Zig

It's a good suggestion, but no, the difference remains even when I actually watch the whole output go by. The Zig version does output everything, in order, and I can count it with | wc -l. The C and C++ versions both take the same time; it was only under strace that the C++ version was weirdly slow. So it's still weird to me.

output in wc -l


In the Zig godbolt you must add -OReleaseFast as an argument.
When you do, the Zig compiler heavily inlines the code.
work is inlined into Thread.LinuxThreadImpl.spawn__anon_3921.Instance.entryFn. Even the mutex calls are inlined.
I believe that this is the reason for the faster execution (a rough source-level sketch of the pattern follows the assembly below).
zig:

.LBB20_1:
        lock            bts     dword ptr [rip + example.mutex], 0
        jb      .LBB20_2
        mov     rcx, qword ptr [rip + example.i]
        ...
.LBB20_2:
        mov     edi, offset example.mutex
        call    Thread.Mutex.FutexImpl.lockSlow
        mov     rcx, qword ptr [rip + example.i]

c:

.L3:
        mov     edi, OFFSET FLAT:lock
        call    pthread_mutex_trylock
        test    eax, eax
        jne     .L3
        mov     rsi, QWORD PTR i[rip]
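
At the source level, the shape being inlined is roughly the following. This is only a sketch of the try-fast-path / lock-slow idea (SketchMutex is a made-up name, written against the std.atomic.Value and std.Thread.Futex APIs of recent Zig releases), not the actual std.Thread.Mutex source:

const std = @import("std");

const SketchMutex = struct {
    state: std.atomic.Value(u32) = std.atomic.Value(u32).init(0),

    fn lock(self: *SketchMutex) void {
        // fast path: a single atomic exchange, cheap to inline at every call site
        if (self.state.swap(1, .acquire) == 0) return;
        self.lockSlow();
    }

    fn lockSlow(self: *SketchMutex) void {
        // contended path: keep retrying, sleeping on the futex word in between
        while (self.state.swap(1, .acquire) != 0) {
            std.Thread.Futex.wait(&self.state, 1);
        }
    }

    fn unlock(self: *SketchMutex) void {
        self.state.store(0, .release);
        std.Thread.Futex.wake(&self.state, 1);
    }
};

pub fn main() void {
    var m: SketchMutex = .{};
    m.lock();
    defer m.unlock();
    std.debug.print("locked once, uncontended\n", .{});
}

The point is that the uncontended case is one atomic instruction the compiler can inline straight into work's loop, while pthread_mutex_lock in the C/C++ builds stays an out-of-line libc call.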

My bad, I linked the wrong trunk, but yeah, I think that might be the reason. Still, I feel like C/C++ should be able to get closer to Zig; I fully expected Zig to be a bit behind C/C++, not by much, so this is a pleasant surprise. Mind you, this was about the most naive implementation in each language, and I'm sure someone can get closer than I did.

Okay, so the libc version is only slightly slower. I made the suggestion because libc is production code and I thought it might have fixes for real-world corner cases that could slow things down drastically. Good to rule that out.

The Zig version obviously enjoys the benefit of processing the formatting string at comptime, but the difference can't be that big. Or maybe it can? Just to be sure, maybe you should cImport printf() and use it instead of Zig's own function. Eliminating IO as a factor is going to be helpful.
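
Roughly like this (an untested sketch; it assumes libc is linked, e.g. via exe.linkLibC(), and in the real benchmark only the print call inside work() would change):

const c = @cImport(@cInclude("stdio.h"));

var i: usize = 0;

pub fn main() void {
    // same numbers, but formatted by libc's printf instead of Zig's
    // comptime-checked std.fmt
    while (i < 10) : (i += 1) {
        _ = c.printf("%zu\n", i);
    }
}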

❯ sudo /home/pollivie/local/bin/./poop -d 50000 ./zig_baseline_no_printf ./zig_baseline_printf ./zig_libc_no_printf ./zig_libc_printf
Benchmark 1 (85 runs): ./zig_baseline_no_printf
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           593ms ± 9.89ms     555ms …  603ms          7 ( 8%)        0%
  peak_rss            262KB ±    0       262KB …  262KB          0 ( 0%)        0%
  cpu_cycles         1.98G  ± 17.7M     1.91G  … 2.00G          15 (18%)        0%
  instructions       1.36G  ±  497K     1.36G  … 1.36G          12 (14%)        0%
  cache_references   35.7M  ± 2.04M     27.1M  … 37.0M           7 ( 8%)        0%
  cache_misses       13.1M  ±  143K     12.7M  … 13.5M          13 (15%)        0%
  branch_misses      7.69M  ± 96.0K     7.27M  … 7.79M          13 (15%)        0%
Benchmark 2 (20 runs): ./zig_baseline_printf
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.55s  ± 31.2ms    2.52s  … 2.61s           0 ( 0%)        💩+330.5% ±  1.3%
  peak_rss           1.53MB ± 97.7KB    1.44MB … 1.70MB          0 ( 0%)        💩+482.5% ±  7.9%
  cpu_cycles         6.77G  ± 75.6M     6.64G  … 6.92G           0 ( 0%)        💩+242.6% ±  0.9%
  instructions       7.63G  ± 1.32M     7.63G  … 7.63G           6 (30%)        💩+461.7% ±  0.0%
  cache_references    339M  ± 9.07M      331M  …  364M           2 (10%)        💩+849.2% ±  6.0%
  cache_misses       46.5M  ± 1.11M     44.9M  … 49.1M           3 (15%)        💩+254.3% ±  1.8%
  branch_misses      22.4M  ±  401K     21.7M  … 23.5M           5 (25%)        💩+191.8% ±  1.2%
Benchmark 3 (84 runs): ./zig_libc_no_printf
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           601ms ± 3.02ms     592ms …  606ms          0 ( 0%)        💩+  1.4% ±  0.4%
  peak_rss           1.72MB ± 76.2KB    1.57MB … 1.84MB         29 (35%)        💩+555.4% ±  6.2%
  cpu_cycles         1.96G  ± 13.0M     1.93G  … 2.00G           9 (11%)          -  0.5% ±  0.2%
  instructions       1.04G  ±  295K     1.04G  … 1.04G           0 ( 0%)        ⚡- 23.1% ±  0.0%
  cache_references   37.0M  ±  830K     33.5M  … 38.7M           4 ( 5%)        💩+  3.5% ±  1.3%
  cache_misses       13.8M  ±  156K     13.4M  … 14.2M          14 (17%)        💩+  4.7% ±  0.3%
  branch_misses      7.61M  ± 56.1K     7.46M  … 7.70M           0 ( 0%)          -  1.0% ±  0.3%
Benchmark 4 (20 runs): ./zig_libc_printf
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.60s  ± 16.7ms    2.59s  … 2.65s           2 (10%)        💩+338.4% ±  1.0%
  peak_rss           1.60MB ±  109KB    1.44MB … 1.70MB          0 ( 0%)        💩+510.0% ±  8.8%
  cpu_cycles         6.85G  ± 46.6M     6.79G  … 6.99G           1 ( 5%)        💩+247.0% ±  0.6%
  instructions       7.63G  ±  532K     7.63G  … 7.63G           2 (10%)        💩+461.9% ±  0.0%
  cache_references    324M  ± 3.31M      320M  …  333M           0 ( 0%)        💩+808.7% ±  3.2%
  cache_misses       42.3M  ±  453K     41.6M  … 43.3M           2 (10%)        💩+221.9% ±  0.9%
  branch_misses      22.4M  ±  183K     22.3M  … 23.0M           2 (10%)        💩+191.7% ±  0.7%

I'm pretty sure you are right: the printf versions take about 5x as long as the non-printf versions. This is interesting, to say the least.
