just to help you see I’ve made a little example :
const std = @import("std");
const BoundedArray = std.BoundedArray;
const Timer = std.time.Timer;
pub fn FooAos(comptime T: type) type {
return struct {
const Self = @This();
alive: bool = true,
padding: T = 0,
pub fn init(comptime N: usize) !BoundedArray(@This(), N) {
return try BoundedArray(Self, N).init(N);
}
};
}
pub fn main() !void {
const stdout = std.io.getStdOut();
const writer = stdout.writer();
{
const ITERATION = 10;
const LEN = 128;
const foo_aos = try FooAos(u31).init(LEN);
const foo_slice = foo_aos.slice();
try writer.print("foo_aos : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (foo_slice) |foo| {
std.mem.doNotOptimizeAway(foo.alive);
}
const lap = timer.lap();
try writer.print("foo_aos : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
{
const ITERATION = 10;
const LEN = 256;
const foo_aos = try FooAos(u31).init(LEN);
const foo_slice = foo_aos.slice();
try writer.print("foo_aos : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (foo_slice) |foo| {
std.mem.doNotOptimizeAway(foo.alive);
}
const lap = timer.lap();
try writer.print("foo_aos : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
{
const ITERATION = 10;
const LEN = 1024;
const foo_aos = try FooAos(u31).init(LEN);
const foo_slice = foo_aos.slice();
try writer.print("foo_aos : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (foo_slice) |foo| {
std.mem.doNotOptimizeAway(foo.alive);
}
const lap = timer.lap();
try writer.print("foo_aos : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
}
and the same version in soa
const std = @import("std");
const BoundedArray = std.BoundedArray;
const Timer = std.time.Timer;
pub fn FooSoa(comptime T: type, comptime N: usize) type {
return struct {
alives: BoundedArray(bool, N),
paddings: BoundedArray(T, N),
pub fn init() !@This() {
return .{
.alives = try BoundedArray(bool, N).init(N),
.paddings = try BoundedArray(T, N).init(N),
};
}
};
}
pub fn main() !void {
const stdout = std.io.getStdOut();
const writer = stdout.writer();
{
const ITERATION = 10;
const LEN = 128;
const foo_soa = try FooSoa(u31, LEN).init();
const alives = foo_soa.alives.slice();
try writer.print("foo_soa : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (alives) |foo| {
std.mem.doNotOptimizeAway(foo);
}
const lap = timer.lap();
try writer.print("foo_soa : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
{
const ITERATION = 10;
const LEN = 256;
const foo_soa = try FooSoa(u31, LEN).init();
const alives = foo_soa.alives.slice();
try writer.print("foo_soa : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (alives) |foo| {
std.mem.doNotOptimizeAway(foo);
}
const lap = timer.lap();
try writer.print("foo_soa : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
{
const ITERATION = 10;
const LEN = 1024;
const foo_soa = try FooSoa(u31, LEN).init();
const alives = foo_soa.alives.slice();
try writer.print("foo_soa : ITERATION {d}, LEN {d}\n", .{ ITERATION, LEN });
var timer = try Timer.start();
for (0..ITERATION) |_| {
for (alives) |foo| {
std.mem.doNotOptimizeAway(foo);
}
const lap = timer.lap();
try writer.print("foo_soa : time {d}us\n", .{(lap / std.time.ns_per_us)});
}
}
}
the result when using poop are :
❯ poop ./foo_soa ./foo_aos
Benchmark 1 (10000 runs): ./foo_soa
measurement mean ± σ min … max outliers delta
wall_time 329us ± 21.0us 302us … 982us 823 ( 8%) 0%
peak_rss 526KB ± 180KB 184KB … 815KB 0 ( 0%) 0%
cpu_cycles 36.7K ± 1.02K 35.3K … 61.6K 491 ( 5%) 0%
instructions 69.7K ± 5.06 69.7K … 69.8K 24 ( 0%) 0%
cache_references 499 ± 50.1 399 … 1.08K 392 ( 4%) 0%
cache_misses 177 ± 18.1 129 … 388 236 ( 2%) 0%
branch_misses 479 ± 15.5 453 … 561 786 ( 8%) 0%
Benchmark 2 (10000 runs): ./foo_aos
measurement mean ± σ min … max outliers delta
wall_time 338us ± 19.1us 314us … 555us 1062 (11%) 💩+ 2.8% ± 0.2%
peak_rss 840KB ± 2.30KB 733KB … 840KB 102 ( 1%) 💩+ 59.6% ± 0.7%
cpu_cycles 37.2K ± 972 35.8K … 61.0K 556 ( 6%) 💩+ 1.6% ± 0.1%
instructions 69.7K ± 4.47 69.7K … 69.8K 19 ( 0%) - 0.0% ± 0.0%
cache_references 815 ± 79.0 637 … 1.89K 511 ( 5%) 💩+ 63.3% ± 0.4%
cache_misses 223 ± 23.8 164 … 449 179 ( 2%) 💩+ 25.8% ± 0.3%
branch_misses 481 ± 15.9 459 … 558 878 ( 9%) + 0.3% ± 0.1%
This is very amateurish benchmark, and doesn’t really tells you how it will perform within an entire piece of complete software, but it’s meant to show that this design does reduce cache misses.
and if I set the LEN to 4096 that will exceed my L1 size the Results are dramatic this time.
❯ poop ./foo_soa ./foo_aos
Benchmark 1 (10000 runs): ./foo_soa
measurement mean ± σ min … max outliers delta
wall_time 317us ± 21.7us 288us … 987us 871 ( 9%) 0%
peak_rss 513KB ± 168KB 184KB … 823KB 0 ( 0%) 0%
cpu_cycles 52.5K ± 2.10K 51.6K … 109K 509 ( 5%) 0%
instructions 169K ± 1.66 169K … 169K 893 ( 9%) 0%
cache_references 575 ± 58.4 399 … 1.24K 327 ( 3%) 0%
cache_misses 159 ± 17.9 113 … 320 287 ( 3%) 0%
branch_misses 170 ± 11.5 153 … 235 865 ( 9%) 0%
Benchmark 2 (10000 runs): ./foo_aos
measurement mean ± σ min … max outliers delta
wall_time 328us ± 19.4us 306us … 549us 1088 (11%) 💩+ 3.7% ± 0.2%
peak_rss 840KB ± 2.18KB 651KB … 840KB 103 ( 1%) 💩+ 63.8% ± 0.6%
cpu_cycles 55.2K ± 1.68K 54.3K … 115K 423 ( 4%) 💩+ 5.2% ± 0.1%
instructions 169K ± 2.20 169K … 169K 933 ( 9%) + 0.0% ± 0.0%
cache_references 5.25K ± 155 4.59K … 6.31K 2119 (21%) 💩+812.5% ± 0.6%
cache_misses 337 ± 129 207 … 1.40K 2024 (20%) 💩+112.2% ± 1.6%
branch_misses 177 ± 10.4 157 … 232 721 ( 7%) 💩+ 3.8% ± 0.2%