OK, so this is a bit of a silly subject, and the code is even sillier, but there are some things I fail to understand. For context: yesterday I was on Instagram and saw a reel of a guy drag-racing Python/C++/C# with a simple program that just prints the numbers from 0 to 10M. C# was ahead, and I thought there was no way. I didn't bother trying to reproduce it, but I made my own drag-racing attempt using C, C++, and Zig. This is a very dumb experiment: I spawn 32 threads, and they simply increment a global `i` until it reaches a limit.
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
/* Total number of values to print across all threads. */
static const size_t limit = 10000000;
/* Shared counter; every access is guarded by `lock`. */
static size_t i = 0;
pthread_mutex_t lock;

/* Thread entry point.
 *
 * `arg` points to a heap-allocated size_t holding the thread's index j;
 * the thread reads it and frees it immediately.  The thread then keeps
 * printing and incrementing the shared counter until it exceeds
 * `limit - j`, so threads with larger indices drop out slightly earlier.
 * Returns NULL in all cases.
 */
static void *work (void *arg) {
  assert (arg != NULL);
  const size_t local_limit = limit - *((size_t *) arg);
  free (arg);
  while (1) {
    pthread_mutex_lock (&lock);
    if (i <= local_limit) {
      /* %zu is the portable conversion specifier for size_t; %lu is only
       * correct on platforms where unsigned long happens to match size_t. */
      printf ("%zu\n", i++);
    } else {
      pthread_mutex_unlock (&lock);
      return NULL;
    }
    pthread_mutex_unlock (&lock);
  }
  /* Not reached: the loop only exits through the return above. */
}
/* Spawns 32 worker threads that cooperatively print 0..limit, then joins
 * them.  Returns 0 on success, exits non-zero on allocation or thread
 * creation failure. */
int main (void) {
  pthread_t pids[32];
  /* The stdio buffer must outlive main(): stdout is flushed during exit(),
   * after main's stack frame is gone, so an automatic array here would be
   * used after its lifetime has ended (undefined behavior).  A static
   * buffer fixes that. */
  static char buff[4096];
  setvbuf (stdout, buff, _IOFBF, sizeof (buff));
  pthread_mutex_init (&lock, NULL);
  for (size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
    size_t *ptr = (size_t *) malloc (sizeof (size_t));
    if (ptr == NULL) {
      perror ("malloc");
      exit (EXIT_FAILURE);
    }
    *ptr = j;
    /* The thread takes ownership of ptr and frees it. */
    if (pthread_create (&pids[j], NULL, work, ptr) != 0) {
      perror ("pthread_create");
      exit (EXIT_FAILURE);
    }
  }
  for (size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
    pthread_join (pids[j], NULL);
  }
  return 0;
}
compile command
clang -O3 -march=native -mtune=native -flto -fomit-frame-pointer -DBUFSIZ=4096 \
-funroll-loops -fstrict-aliasing -fmerge-all-constants \
-funsafe-math-optimizations -ffast-math -finline-functions \
-fvectorize -fslp-vectorize -fno-plt -fno-semantic-interposition \
-fno-rtti -fvisibility=hidden -fPIE -fPIC -Wl,-O3 -Wl,--as-needed \
-Wl,--gc-sections -flto -s -static -pthread -o main main.c \
-static-libgcc
this is the C++ version
#include <iostream>
#include <pthread.h>
// Total number of values to print across all threads.
static const std::size_t limit = 10000000;
// Shared counter; every access is guarded by `lock`.
static std::size_t i = 0;
pthread_mutex_t lock;

// Thread entry point.
//
// `arg` points to a heap-allocated std::size_t holding the thread's
// index j; the thread reads it and deletes it immediately.  The thread
// then keeps printing and incrementing the shared counter until it
// exceeds `limit - j`.  Always returns a null pointer.
static void *work (void *arg) noexcept {
  const std::size_t *value = static_cast<std::size_t *> (arg);
  const std::size_t local_limit = limit - *value;
  delete value;
  while (true) {
    pthread_mutex_lock (&lock);
    if (i <= local_limit) {
      std::cout << i++ << '\n';
    } else {
      pthread_mutex_unlock (&lock);
      return nullptr;  // nullptr is the idiomatic C++ null pointer constant
    }
    pthread_mutex_unlock (&lock);
  }
  // Not reached: the loop only exits through the return above.
}
// Spawns 32 worker threads that cooperatively print 0..limit, then
// joins them.  Returns 0 on success.
int main (void) {
  pthread_t pids[32];
  // The stream buffer must outlive main(): std::cout is flushed during
  // static destruction, after main's stack frame is gone, so an
  // automatic array here would dangle when the final flush happens.
  // A static buffer fixes that.  pubsetbuf must be called before any
  // output, which holds here.
  static char buffer[4096];
  std::cout.rdbuf()->pubsetbuf(buffer, sizeof(buffer));
  pthread_mutex_init (&lock, nullptr);
  for (std::size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
    // The thread takes ownership of ptr and deletes it.
    std::size_t *ptr = new std::size_t (j);
    pthread_create (&pids[j], nullptr, work, ptr);
  }
  for (std::size_t j = 0; j < sizeof (pids) / sizeof (pids[0]); j += 1) {
    pthread_join (pids[j], nullptr);
  }
  return 0;
}
and this is the compile command
g++ -O3 -march=native -mtune=native -flto -fomit-frame-pointer -fno-exceptions \
-funroll-loops -fstrict-aliasing -fmerge-all-constants \
-funsafe-math-optimizations -ffast-math -finline-functions \
-fno-plt -fno-semantic-interposition \
-fno-rtti -fvisibility=hidden -fPIE -fPIC -Wl,-O3 -Wl,--as-needed \
-Wl,--gc-sections -flto -s -static -pthread -static-libgcc -o main main.cpp
and finally this is the Zig code
const std = @import("std");
const thread = std.Thread;
// Buffered stdout writer shared by all worker threads.  Access is
// serialized by `mutex` below, and main() flushes the buffer after all
// threads are joined.
const out = std.io.getStdOut();
var buf = std.io.bufferedWriter(out.writer());
const writer = buf.writer();
// Argument bundle passed by value to each worker thread.
const args = struct {
allocator: std.mem.Allocator,
value: *usize,
w: @TypeOf(writer),
};
// Total number of values to print across all threads.
const limit = 10_000_000;
// NOTE(review): `alloc_mutex` is never used in this file — allocations
// already go through a ThreadSafeAllocator in main(); consider removing.
var alloc_mutex: thread.Mutex = .{};
// Guards the shared counter `i` and the buffered writer.
var mutex: thread.Mutex = .{};
var i: usize = 0;
// Worker thread body: repeatedly prints and increments the shared
// counter `i` under `mutex` until it exceeds `limit - j`, where j is
// this thread's index (read from arg.value, which is destroyed
// immediately after the read).
fn work(arg: args) void {
const local_limit = limit - arg.value.*;
arg.allocator.destroy(arg.value);
while (true) {
thread.Mutex.lock(&mutex);
if (i <= local_limit) {
// Output goes into the userspace buffer; a write() syscall only
// happens when the buffer fills (presumably the 4 KiB bufferedWriter
// default — verify against the Zig std version in use).
arg.w.print("{d}\n", .{i}) catch unreachable;
i += 1;
} else {
thread.Mutex.unlock(&mutex);
return;
}
thread.Mutex.unlock(&mutex);
}
}
// Spawns 32 worker threads that cooperatively print 0..limit, joins
// them, then flushes the buffered writer.
pub fn main() !void {
const N: usize = 32;
var tid: [N]thread = undefined;
// Arena on top of the page allocator: every per-thread allocation is
// released in one shot by arena.deinit().
const page_allocator = std.heap.page_allocator;
var arena = std.heap.ArenaAllocator.init(page_allocator);
defer arena.deinit();
// The arena itself is not thread-safe, so wrap it: worker threads call
// destroy() concurrently from work().
var thread_safe_allocator: std.heap.ThreadSafeAllocator = .{ .child_allocator = arena.allocator() };
const allocator = thread_safe_allocator.allocator();
// Flush whatever is left in the userspace buffer after all threads
// have finished; deferred so it runs after the joins below.
defer buf.flush() catch unreachable;
for (0..N) |j| {
const arg: args = .{
.allocator = allocator,
.value = allocator.create(usize) catch unreachable,
.w = writer,
};
arg.value.* = j;
// 16 MiB stack per thread; work() takes ownership of arg.value.
tid[j] = thread.spawn(.{ .stack_size = 16 * 1024 * 1024, .allocator = allocator }, work, .{arg}) catch unreachable;
}
for (tid) |t| {
thread.join(t);
}
}
and in the build.zig
// Release configuration for the Zig contender; roughly mirrors the
// aggressive C/C++ flags (strip symbols, omit frame pointers, no error
// tracing, no thread sanitizer).  `b` and `target` come from the
// enclosing build() function (not shown here).
const exe = b.addExecutable(.{
.name = "zig",
.root_source_file = b.path("src/main.zig"),
.target = target,
// ReleaseFast is the closest analogue of -O3 -ffast-math.
.optimize = .ReleaseFast,
.strip = true,
.single_threaded = false,
.error_tracing = false,
.sanitize_thread = false,
.omit_frame_pointer = true,
.use_llvm = true,
.use_lld = true,
});
I can’t really make sense of the results I’m getting using hyperfine
❯ hyperfine -r 10 ./c/main ./cpp/main ./zig/zig-out/bin/zig
Benchmark 1: ./c/main
Time (mean ± σ): 3.085 s ± 0.015 s [User: 1.936 s, System: 43.669 s]
Range (min … max): 3.061 s … 3.111 s 10 runs
Benchmark 2: ./cpp/main
Time (mean ± σ): 3.103 s ± 0.007 s [User: 2.015 s, System: 43.875 s]
Range (min … max): 3.091 s … 3.111 s 10 runs
Benchmark 3: ./zig/zig-out/bin/zig
Time (mean ± σ): 773.8 ms ± 38.4 ms [User: 647.0 ms, System: 11048.7 ms]
Range (min … max): 713.6 ms … 805.0 ms 10 runs
Summary
./zig/zig-out/bin/zig ran
3.99 ± 0.20 times faster than ./c/main
4.01 ± 0.20 times faster than ./cpp/main
Again, I know this is a very stupid experiment that doesn't really tell us anything about anything, and I'm positive I'm missing some expert-level knowledge in C/C++ to achieve better performance — but at least nothing obvious jumps out at me. What is it that I'm doing in Zig that makes it so much faster than the other two?