I am trying to implement k-means clustering in Zig. I have 3 function: 1. get_mean which returns random points as centroids, 2. get_clusters which creates a k sized array of arraylists to hold the data points and 3. assign_cluster which assigns the datapoints to their respective clusters. The issue is that get_mean returns a slice and when I am looping over the datapoints for some reason the values in the mean array would all become 0.
Here are the functions:
pub fn closest(haystack: *[]f32, needle: f32) usize {
var idx: usize = 0;
var currentClosest: f32 = 100000000.0;
for (haystack.*, 0..) |thing, i| {
if (math.sqrt(math.pow(f32, needle - thing, 2)) < currentClosest) {
currentClosest = math.sqrt(math.pow(f32, needle - thing, 2));
idx = i;
//print("closest: {d:.2}\n", .{currentClosest});
//print("idx: {d}\n", .{idx});
}
}
return idx;
}
//Dummy dataset
pub const dataset = [_]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0 };
//Random Partition Kmeans Algorithm
//get the first random centroids of the dataset
pub fn get_means(comptime T: type, comptime k: usize, data: []const T) []T {
var means: [k]T = undefined;
for (0..k) |i| {
var cluster = data[rand.uintAtMost(usize, data.len - 1)];
while (inSlice(T, &means, cluster)) {
cluster = data[rand.uintAtMost(usize, data.len - 1)];
}
means[i] = cluster;
}
print("means: {d:.2}\n", .{means});
return &means;
}
pub fn make_clusters(comptime T: type, comptime k: usize, comptime allocator: std.mem.Allocator) ![]std.ArrayList(T) {
var clusters = try allocator.alloc(std.ArrayList(T), k);
for (0..k) |i| {
clusters[i] = std.ArrayList(T).init(allocator);
}
return clusters;
}
pub fn assign_cluster(comptime T: type, comptime data: []const T, clusters: []std.ArrayList(T)) ![]std.ArrayList(T) {
var means = get_means(T, 3, data);
print("means: {d:.2}\n", .{means});
for (data) |d| {
print("means in loop: {d:.2}\n", .{means});
var group = closest(&means, d);
try clusters[group].append(d);
}
return clusters;
}
pub fn main() !void {
var clusters = try make_clusters(f32, 3, gpa_allocator);
var group = try assign_cluster(f32, dataset[0..], clusters);
for (group, 0..) |c, i| {
print("cluster {d}: {d:.2}\n", .{ i + 1, c.items });
}
}
This is the output when the code is run:
means: { 2.00, 12.00, 15.00 }
means: { 2.00, 12.00, 15.00 }
means in loop: { 2.00, 12.00, 15.00 }
means in loop: { 0.00, -3705342704467114500000000000000000.00, 0.00 }
means in loop: { 0.00, -3705343942407154000000000000000000.00, 0.00 }
means in loop: { 0.00, -3705345180347193000000000000000000.00, 0.00 }
means in loop: { 0.00, -3705346418287232400000000000000000.00, 0.00 }
means in loop: { 0.00, -3705347656227271700000000000000000.00, 0.00 }
means in loop: { 0.00, -3705348894167311000000000000000000.00, 0.00 }
means in loop: { 0.00, -3705350132107350000000000000000000.00, 0.00 }
means in loop: { 0.00, -3705351370047389500000000000000000.00, 0.00 }
means in loop: { 0.00, -3702856920868229500000000000000000.00, 0.00 }
means in loop: { 0.00, -3702858158808269000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702859396748308000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702860634688347400000000000000000.00, 0.00 }
means in loop: { 0.00, -3702861872628386600000000000000000.00, 0.00 }
means in loop: { 0.00, -3702863110568426000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702864348508465000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702865586448504500000000000000000.00, 0.00 }
means in loop: { 0.00, -3702866824388544000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702868062328583000000000000000000.00, 0.00 }
means in loop: { 0.00, -3702869300268622400000000000000000.00, 0.00 }
means in loop: { 0.00, -3702870538208661600000000000000000.00, 0.00 }
means in loop: { 0.00, -3710438065668813000000000000000000.00, 0.00 }
means in loop: { 0.00, -3710439303608852500000000000000000.00, 0.00 }
cluster 1: { 1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00, 9.00, 10.00, 11.00, 12.00, 13.00, 14.00, 15.00, 16.00, 17.00, 18.00, 19.00, 20.00, 21.00, 22.00, 23.00 }
cluster 2: { }
cluster 3: { }
[Process exited 0]
I’m not entirely sure this is happening but I’m guessing it’s some pointer magic that I’m too dumb to understand. Any help would be appreciated.