I want to measure the throughput of the L1, L2 and L3 caches and of main memory on my machine. For this, I wrote the following piece of Zig, which reports throughput as a function of the working-set size.
To make this benchmark truthful, I need the right level of optimization:
- I don’t want any memory accesses to be optimized out.
- I do want everything else to be optimized, e.g. safety checks skipped and SIMD used.
const std = @import("std");
// zig build -Doptimize=ReleaseFast run
/// Memory-throughput micro-benchmark.
///
/// `run` repeatedly applies a cheap read-modify-write to every byte of a
/// heap buffer and reports how many bytes per second were processed. The
/// buffer size (`bytes_per_run`) is the working-set size.
const Bench = struct {
    const Self = @This();

    /// Number of complete passes made over the buffer.
    nruns: u64,
    /// Size of the buffer in bytes.
    bytes_per_run: u64,
    /// Wall-clock duration of all passes, in seconds.
    seconds: f64,
    /// `bytes_total` divided by `seconds`.
    bytes_per_second: f64,
    /// Bytes actually processed: `bytes_per_run * nruns`. This can be smaller
    /// than the requested total because `nruns` is rounded down.
    bytes_total: u64,

    const Options = struct {
        /// Requested number of bytes to process across all passes.
        bytes_total: u64 = 1e10,
        /// Size of the buffer that is swept on every pass; must be non-zero.
        bytes_per_run: u64,
    };

    /// Cheap per-byte computation. Wrapping operators are required: plain
    /// `*`/`+` on `u8` overflow for most inputs, which panics in safe builds
    /// and is undefined behaviour in ReleaseFast.
    fn work(x: u8) u8 {
        return x *% x +% x +% 1;
    }

    /// Runs the benchmark and returns the measured statistics.
    ///
    /// Errors:
    /// - `error.BytesPerRunIsZero` if `options.bytes_per_run` is 0.
    /// - `error.BytesTotalTooSmall` if not even one pass fits in `bytes_total`.
    /// - any error returned by `alloc.alloc`.
    ///
    /// The buffer is allocated and freed inside this function.
    pub fn run(alloc: std.mem.Allocator, options: Options) !Self {
        if (options.bytes_per_run == 0) {
            // Would otherwise divide by zero when computing `nruns`.
            return error.BytesPerRunIsZero;
        }
        if (options.bytes_per_run > options.bytes_total) {
            return error.BytesTotalTooSmall;
        }

        const xs = try alloc.alloc(u8, options.bytes_per_run);
        defer alloc.free(xs);

        // Freshly allocated memory is `undefined`; give it a defined value
        // before reading it. Doing this before the clock starts also keeps
        // the first write to each page out of the timed region.
        @memset(xs, 1);
        std.mem.doNotOptimizeAway(xs);

        const nruns: u64 = options.bytes_total / options.bytes_per_run;

        const t_start = std.time.nanoTimestamp();
        for (0..nruns) |_| {
            // Inner loop is a plain slice sweep so it can still be vectorized.
            for (xs) |*x| {
                x.* +%= work(x.*);
            }
            // Optimization barrier once per pass: the buffer contents must be
            // materialized in memory here, so passes cannot be merged or
            // elided by the compiler.
            std.mem.doNotOptimizeAway(xs);
        }
        const t_stop = std.time.nanoTimestamp();

        const seconds: f64 = std.math.lossyCast(f64, t_stop - t_start) * 1e-9;
        const bytes_total = options.bytes_per_run * nruns;
        const bytes_per_second = std.math.lossyCast(f64, bytes_total) / seconds;

        return Self{
            .nruns = nruns,
            .bytes_per_run = options.bytes_per_run,
            .seconds = seconds,
            .bytes_per_second = bytes_per_second,
            .bytes_total = bytes_total,
        };
    }
};
/// Entry point: benchmarks buffer sizes 2^10 through 2^30 bytes, doubling each
/// step, and prints one table row (buffer size, bytes per second) per size to
/// stdout.
pub fn main() !void {
    const out = std.io.getStdOut().writer();
    try out.print("bytes / run | bytes / s\n", .{});

    const smallest: u64 = 1 << 10;
    const largest: u64 = 1 << 30;

    var size: u64 = smallest;
    while (size <= largest) : (size *= 2) {
        const result = try Bench.run(std.heap.page_allocator, .{
            .bytes_per_run = size,
            .bytes_total = 1e10,
        });
        const size_as_float = std.math.lossyCast(f64, result.bytes_per_run);
        try out.print("{e:11.3} | {e:.3}\n", .{ size_as_float, result.bytes_per_second });
    }
}
I think there are some problems with this code. I tried to make the computation complex enough that Zig does not optimize out any memory accesses, but I am not sure whether I succeeded. Also, is there a better way, such as an annotation, to force the memory accesses to happen?