Iāve watched @Validark 's presentaions on parsing, and solved your example as a practice problem, wonāt claim itās easily understandable, but here it goes anyway:
In your case to split a alphanumeric sequence into letters and numbers you can:
- switch on the first byte
- create a bitstring of all the subsequent bytes of the same type
- count trailing ones to get the ending position of the first token, emit token, advance buffer cursor by token length and repeat
Here is the example tokenizer from the second presentation.
Quick example:
const std = @import("std");
pub fn main() void {
var buf: [512]u8 = @splat(0);
const V = @Vector(16, u8);
const fen = "foobar123baz456/";
@memcpy(buf[0..fen.len], fen);
var cur: []const u8 = buf[0..];
var vec: V = cur[0..16].*;
// first byte letter so make letter bitstring
var bitstring: u16 =
(@as(u16, @bitCast(@as(V, @splat('a')) <= vec)) & @as(u16, @bitCast(vec <= @as(V, @splat('z')))));
var len = @ctz(~bitstring);
std.debug.print(
\\{s} vector
\\{b} bitstring in reverse
\\len == {d}
\\token{{ .type = .identifier .slice = {s} }}
\\
\\
, .{
cur[0..16],
@bitReverse(bitstring),
len,
cur[0..len],
});
cur = cur[len..];
vec = cur[0..16].*;
// first byte number so make number bitstring
bitstring =
(@as(u16, @bitCast(@as(V, @splat('0')) <= vec)) & @as(u16, @bitCast(vec <= @as(V, @splat('9')))));
len = @ctz(~bitstring);
std.debug.print(
\\{s}
\\{b}
\\len == {d}
\\token{{ .type = .number .slice = {s} }}
\\
, .{
cur[0..16],
@bitReverse(bitstring),
len,
cur[0..len],
});
}
Result:
foobar123baz456/ vector
1111110001110000 bitstring in reverse
len == 6
token{ .type = .identifier .slice = foobar }
123baz456/\0\0\0\0\0\0
1110001110000000
len == 3
token{ .type = .number .slice = 123 }
This is wasteful, you donāt need to recalculate the bitstring everytime, so later in the presentation thereās a much harder to understand example (presentations/hyper-optimizing-zig-parser/#/my-tokenizer-code), that reuses the bitstrings.
As a challenge i rewrote it to fit your problem:
pub fn tokenize(gpa: std.mem.Allocator, source: []align(64) const u8) ![]Token {
var tokens: std.ArrayListUnmanaged(Token) = .empty;
errdefer tokens.deinit(gpa);
var prev: []const u8 = source[0..];
var cur = prev[0..];
var op_type: Tag = switch (cur[0]) {
'a'...'z', 'A'...'Z' => .identifier,
'0'...'9' => .number,
' ', '\t', '\n' => .whitespace,
else => return error.UnexpectedStart,
};
var bitstrings: [4]u64 = undefined;
var selected_bitmap: *u64 = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
var bitmap_index = @intFromPtr(source.ptr);
bitmap_index -%= 64;
while (true) {
var aligned_ptr = @intFromPtr(cur.ptr) / 64 * 64;
while (true) {
while (bitmap_index != aligned_ptr) {
bitmap_index +%= 64;
const V = @Vector(64, u8);
const vec: V = @as([*]align(64) const u8, @ptrFromInt(bitmap_index))[0..64].*;
bitstrings[0] =
(@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
(@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z')))));
bitstrings[1] =
(@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));
bitstrings[2] =
@as(usize, @bitCast(vec == @as(V, @splat('\t')))) |
@as(usize, @bitCast(vec == @as(V, @splat('\n')))) |
@as(usize, @bitCast(vec == @as(V, @splat(' '))));
bitstrings[3] = ~(bitstrings[0] | bitstrings[1] | bitstrings[2]);
}
const cur_misalignment: std.math.Log2Int(u64) = @truncate(@intFromPtr(cur.ptr));
const bitstring = selected_bitmap.* >> cur_misalignment;
cur = cur[@ctz(~bitstring)..];
// If we made it to the end of chunk(s), wrap around, grab more bits, and start again
aligned_ptr = @intFromPtr(cur.ptr) / 64 * 64;
if (bitmap_index == aligned_ptr) break;
}
if (op_type == .eof) break;
const len: u8 = @intCast(@intFromPtr(cur.ptr) - @intFromPtr(prev.ptr));
try tokens.append(gpa, .{ .len = len, .kind = op_type });
prev = cur;
op_type = sw: switch (cur[0]) {
'a'...'z', 'A'...'Z', '_' => .identifier,
'0'...'9' => .number,
'/' => {
try tokens.append(gpa, .{ .len = 1, .kind = .slash });
cur = cur[1..];
prev = cur;
continue :sw cur[0];
},
' ', '\t', '\n' => .whitespace,
0 => .eof,
else => .unknown,
};
selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
}
try tokens.append(gpa, .{ .len = 0, .kind = op_type });
return tokens.toOwnedSlice(gpa);
}
const Tag = enum(u8) {
identifier = 0,
number = 1,
whitespace = 2,
unknown = 3,
slash,
eof,
};
const Token = struct {
kind: Tag,
len: u8,
};
pub fn main() !void {
var debug_allocator: std.heap.DebugAllocator(.{}) = .init;
const gpa = debug_allocator.allocator();
defer _ = debug_allocator.deinit();
const stdout = std.io.getStdOut().writer();
const fen = "foo/123/foo123bar/123foo/foo123bar456//123foo456 player GarriŠŠ°ŃŃŠø!KasparowŠŠ°ŃŠæŠ°ŃŠ¾Š²! 420 1337 96";
const overaligned_size = std.mem.alignForward(u64, fen.len, 64);
const buffer = try gpa.alignedAlloc(u8, .@"64", overaligned_size);
defer gpa.free(buffer);
// we rely on a nullbyte to know when to stop parsing
if (builtin.mode == .Debug) @memset(buffer, 0);
@memcpy(buffer[0..fen.len], fen[0..]);
const tokens = try tokenize(gpa, buffer);
defer gpa.free(tokens);
var cur: []const u8 = buffer[0..];
for (tokens) |token| {
try stdout.print("{s}<", .{@tagName(token.kind)[0..1]});
try stdout.print("{s}> ", .{cur[0..token.len]});
if (token.kind == .slash or token.kind == .whitespace) try stdout.writeByte('\n');
cur = cur[token.len..];
}
try stdout.writeByte('\n');
}
const std = @import("std");
const builtin = @import("builtin");
Result for input āfoo/123/foo123bar/123foo/foo123bar456//123foo456 player GarriŠŠ°ŃŃŠø!KasparowŠŠ°ŃŠæŠ°ŃŠ¾Š²! 420 1337 96ā
i<foo> s</>
n<123> s</>
i<foo> n<123> i<bar> s</>
n<123> i<foo> s</>
i<foo> n<123> i<bar> n<456> s</>
s</>
n<123> i<foo> n<456> w< >
i<player> w< >
i<Garri> u<ŠŠ°ŃŃŠø!> i<Kasparow> u<ŠŠ°ŃŠæŠ°ŃŠ¾Š²!> w< >
n<420> w< >
n<1337> w< >
n<96> e<>
I think these tokens are what you want, here printed as the first letter of the token type (i = identifier, n = number) and their content in the source buffer.
I have no clue about unicode (the u stands for unknown btw not unicode).
Needs master to compile (0.15.0-dev.646+ef35c3d5f).