FYI each u1 takes up 1 byte in an array (i.e. @sizeOf(u1) returns 1, @sizeOf([8]u1) is 8, see the @sizeOf docs for context).
Check out std.bit_set if you want a bit set.
FYI each u1 takes up 1 byte in an array (i.e. @sizeOf(u1) returns 1, @sizeOf([8]u1) is 8, see the @sizeOf docs for context).
Check out std.bit_set if you want a bit set.
Yeah, I didn’t go that far since, with a bitset I’d have to think “oh, more than 8 options? One more byte” and have to deal with the segmenting. If I do move forward with that, I presume I’d use DynamicBitSetUnmanaged as the size is runtime known, and I’d want to let it choose between Integer backed and array backed.
Independent of that though, what blew me away was that there was no way for the compiler to know how many definitions I had. If you look at the
const definitions: [3]FullDefinition = .{ .{...}, .{...}, .{...}}
I am not permitted to use _ in place of the three. I’d have thought since it’s all compile-time know it would’ve, especially since you can for strings (array of u8).
Separate to that, wonder if I would’ve been better off just storing a bunch of slices since everything is just strings. You’d still have to segment the stored slices by capture so that you swap out the right ones though.
This works:
const definitions = [_]FullDefinition{ .{...}, .{...}, .{...} };
Thanks, I had been improperly under the impression that StructName{ .. } was for single items and const thing: StructName = … was for arrays or pointers.
Am working on the converse of this thing at the moment, following the same approach of pre-calculating as much as possible to minimize memory request calls. Once again, a seeming hell. However, I did manage to complete it but missed an edge case.
The converse would be, you define a schema for your strings (for simplicity’s sake, just well-defined breaks partitioning the string). You then select segments and collapse along those.
E.g. say “us” is the “site” and the number is the “region.” you could have “us0”, “us1”, “us2”, “jp0”, “jp1”, “jp2”, etc.
If we group by “site”, this would collapse into two strings us* and jp*
My issue arose in that, say you have values that do not vary. E.g. a separator us-0, us-1, etc.
Since ‘-’ doesn’t actually vary, globbing it into the * is a needless loss of information.
Presently, my program yields us** jp** i.e. it globs both segments as they’re not in the grouped by.
I think the only way forward is to restart from scratch. My thoughts are:
Managed to finish the above. Pretty rats nest like, but it works.
The summary of usage is just, define a schema for your string. Your string must be able to be partitioned into segments that are either fixed length or variable length that are determined by some unique manner of termination.
Then, select which segments are to remain fixed, i.e. to group by. You can think of this as the segments that will not be globbed when flattening the entries. Included below is the full single function plus an example main.
const std = @import("std");
const FixedSegment = struct {
length: usize,
};
const VarSegment = struct {
terminator: * const fn (u8) bool,
};
const SegmentType = enum {
fixed,
variable,
};
const Segment = struct {
name: [] const u8,
definition: SegmentDefinition,
};
const SegmentDefinition = union (SegmentType) {
fixed: FixedSegment,
variable: VarSegment,
};
fn groupBy(
allocator: std.mem.Allocator,
entries: []const []const u8,
segments: []const Segment,
selected: []const []const u8,
writer: *std.Io.Writer
) !void {
const selected_segment_indices = try allocator.alloc(usize, selected.len);
defer allocator.free(selected_segment_indices);
for (selected, 0..) |name, i| {
for (segments, 0..) |segment, segment_idx| {
if (std.mem.eql(u8, name, segment.name)) {
selected_segment_indices[i] = segment_idx;
break;
}
} else {
return error.SelectedNonExistingSegment;
}
}
const transient_flags = try allocator.alloc(u1, segments.len);
defer allocator.free(transient_flags);
// NOTE: give up on having minimum permanent flags.
// Just easier to be full, especially when copying
// transient flags to permanent.
const permanent_flags = try allocator.alloc(u1, segments.len * entries.len);
defer allocator.free(permanent_flags);
const matches = try allocator.alloc(usize, entries.len);
defer allocator.free(matches);
@memset(transient_flags, 0);
@memset(permanent_flags, 0);
@memset(matches, 0);
var num_unique: usize = 0;
for (entries, 0..) |entry, considered_idx| {
var match_idx: usize = 0;
matcher: while (match_idx < num_unique): (match_idx += 1) {
var considered_rolling_start_idx: usize = 0;
var existing_rolling_start_idx: usize = 0;
for (segments, 0..) |segment, segment_idx| {
switch (segment.definition) {
.fixed => |fixed_segment| {
const considered_slice = entry[considered_rolling_start_idx..(considered_rolling_start_idx + fixed_segment.length)];
const existing_idx = matches[match_idx];
const existing_slice = entries[existing_idx][existing_rolling_start_idx..(existing_rolling_start_idx + fixed_segment.length)];
transient_flags[segment_idx] = if (std.mem.eql(u8, considered_slice, existing_slice)) 0 else 1;
considered_rolling_start_idx += fixed_segment.length;
existing_rolling_start_idx += fixed_segment.length;
},
.variable => |var_segment| {
var considered_end_idx = considered_rolling_start_idx;
while (considered_end_idx < entry.len and !var_segment.terminator(entry[considered_end_idx])) {
considered_end_idx += 1;
}
const considered_slice = if (considered_rolling_start_idx == considered_end_idx) entry[considered_rolling_start_idx..] else entry[considered_rolling_start_idx..considered_end_idx];
const existing_idx = matches[match_idx];
var existing_end_idx = existing_rolling_start_idx;
while (existing_end_idx < entry.len and !var_segment.terminator(entries[existing_idx][existing_end_idx])) {
existing_end_idx += 1;
}
const existing_slice = if (existing_rolling_start_idx == existing_end_idx) entries[existing_idx][existing_rolling_start_idx..] else entry[existing_rolling_start_idx..existing_end_idx];
transient_flags[segment_idx] = if (std.mem.eql(u8, considered_slice, existing_slice)) 0 else 1;
considered_rolling_start_idx = considered_end_idx;
existing_rolling_start_idx = existing_end_idx;
},
}
}
var selected_idx: usize = 0;
outer: while (selected_idx < selected.len): (selected_idx += 1) {
const segment_idx = selected_segment_indices[selected_idx];
for (transient_flags, 0..) |value, flag_idx| {
if (flag_idx == segment_idx and value == 1) {
// did not match, that's ok, still have others to try against.
break :outer;
}
}
} else {
// did match for all, merge transient and permanent.
for (segments, 0..) |_, segment_idx| {
permanent_flags[match_idx * segments.len + segment_idx] |= transient_flags[segment_idx];
}
break :matcher;
}
@memset(transient_flags, 0);
} else {
matches[num_unique] = considered_idx;
num_unique += 1;
}
}
var i: usize = 0;
while (i < num_unique): (i += 1) {
const entry_idx = matches[i];
const entry = entries[entry_idx];
var rolling_start_idx: usize = 0;
for (segments, 0..) |segment, segment_idx| {
var offset: usize = 0;
switch (segment.definition) {
.fixed => |fixed_segment| {
offset = fixed_segment.length;
},
.variable => |var_segment| {
while (rolling_start_idx + offset < entry.len and !var_segment.terminator(entry[rolling_start_idx + offset])) {
offset += 1;
}
}
}
// if we're on a selected index or if permanent_flag is 0, print out
const selected_segment = for (selected_segment_indices) |acceptable_idx| {
if (segment_idx == acceptable_idx) {
break true;
}
} else false;
if (selected_segment or permanent_flags[i * segments.len + segment_idx] == 0) {
const slice = if (rolling_start_idx + offset > entry.len) entry[rolling_start_idx..] else entry[rolling_start_idx..(rolling_start_idx + offset)];
try writer.print("{s}", .{slice});
} else {
try writer.print("*", .{});
}
rolling_start_idx += offset;
}
try writer.print("\n", .{});
}
try writer.flush();
}
pub fn main() !void {
const entries: [8][]const u8 = .{
"usdev-front0",
"usdev-front1",
"usdev-front2",
"usdev-front3",
"usdev-back0",
"usdev-back1",
"usdev-back2",
"usdev-back3",
};
const selected: [2][]const u8 = .{
"env",
"prefix",
};
const segments = [_]Segment{
.{
.name = "site",
.definition = .{ .fixed = .{ .length = 2 } }
},
.{
.name = "env",
.definition = .{ .fixed = .{ .length = 3 } }
},
.{
.name = "sep",
.definition = .{ .fixed = .{ .length = 1 } }
},
.{
.name = "prefix",
.definition = .{ .variable = .{ .terminator = std.ascii.isDigit } }
},
.{
.name = "count",
.definition = .{ .variable = .{ .terminator = std.ascii.isAlphanumeric } }
},
};
const GPA = std.heap.GeneralPurposeAllocator(.{});
var gpa: GPA = .init;
const allocator = gpa.allocator();
var buffer: [512]u8 = undefined;
var writer = std.fs.File.stdout().writer(&buffer);
try groupBy(allocator, &entries, &segments, &selected, &writer.interface);
}