general remarks
I think using heavily nested if-else
makes this code look more complicated than it is, by using early exit instead of if-else chains and using if-expressions we can get something more succinct that also looks less intimidating.
Another thing you need to apply is creating and using helper functions, to avoid repeating the same code, via multiple copy-paste. Not saying that everything deserves a helper function, but sometimes it is really useful.
I tried to create a very in-depth step-by-step explanation.
sidenote
I haven’t really run the code, so I could have made a small mistake somewhere, but I didn’t really want to spend the additional time to come up with my own example code and replace the external import of types
.
I think it is better to either include these things in the future or stub them out in some way (just makes it more likely, that someone engages with it, when they don’t have to invest additional time to create a good example use of the data structure).
detailed explanation
From this:
fn search(self: *Node, uri: []const u8) RESULT {
if (uri[0] < self.value) {
if (self.left) |left| {
return left.search(uri);
} else {
return RESULT.COULD_NOT_BE_FOUND;
}
} else if (uri[0] > self.value) {
if (self.right) |right| {
return right.search(uri);
} else {
return RESULT.COULD_NOT_BE_FOUND;
}
} else {
if (uri.len == 1) {
return RESULT.FOUND;
} else {
if (self.equal) |equal| {
return equal.search(uri[1..]);
} else {
return RESULT.COULD_NOT_BE_FOUND;
}
}
}
}
To this, by applying early-exit and if-expressions:
fn search(self: *Node, uri: []const u8) RESULT {
const success = RESULT.FOUND;
const fail = RESULT.COULD_NOT_BE_FOUND;
if (uri[0] < self.value) return if (self.left) |left| left.search(uri) else fail;
if (uri[0] > self.value) return if (self.right) |right| right.search(uri) else fail;
if (uri.len == 1) return success;
return if (self.equal) |equal| equal.search(uri[1..]) else fail;
}
I think this is much easier to read because with every line the function either exits and is done or it continues to the next line.
Additionally because I pulled out success
and fail
into named constants it is also easy to see that search_for_node is essentially the same function (minus small type differences).
fn search_for_node(self: *Node, uri: []const u8) ?*Node {
const success = self;
const fail = null;
if (uri[0] < self.value) return if (self.left) |left| left.search_for_node(uri) else fail;
if (uri[0] > self.value) return if (self.right) |right| right.search_for_node(uri) else fail;
if (uri.len == 1) return success;
return if (self.equal) |equal| equal.search_for_node(uri[1..]) else fail;
}
Now I was wondering what is the real value of having both search_for_node
and search
?
search_for_node
already gives you found / not-found via its optional, so if we want to keep the api we also can do this:
fn search(self: *Node, uri: []const u8) RESULT {
if (self.search_for_node(uri)) |_| RESULT.FOUND else RESULT.COULD_NOT_BE_FOUND;
}
fn search_for_node(self: *Node, uri: []const u8) ?*Node {
const success = self;
const fail = null;
if (uri[0] < self.value) return if (self.left) |left| left.search_for_node(uri) else fail;
if (uri[0] > self.value) return if (self.right) |right| right.search_for_node(uri) else fail;
if (uri.len == 1) return success;
return if (self.equal) |equal| equal.search_for_node(uri[1..]) else fail;
}
At this point I probably would remove the names again (which I only introduced to see whether the code is basically the same).
We can do similar things with the add
function.
First lets pull out the repeating code of creating a new node:
fn createNode(uri: []const u8, allocater: std.mem.Allocator) *Node {
var new_node = try allocater.create(Node);
new_node.init(uri[0], allocater);
if (uri.len > 1) {
if (uri[1] == '/') {
new_node.string_end = true;
}
}
return new_node;
}
I don’t understand why only one of the branches contains the if (uri.len > 1) {
condition, I think the reasoning behind this may be that in the other branches it always happens to be true, but I don’t think such a small condition is a good reason to duplicate the code in 3 branches. And I doubt it results in a big optimization.
Now lets remove the nested ifs (if’s that only set booleans can easily be removed):
fn createNode(uri: []const u8, allocater: std.mem.Allocator) !*Node {
var new_node = try allocater.create(Node);
new_node.init(uri[0], allocater);
if (uri.len > 1 and uri[1] == '/') {
new_node.string_end = true;
}
return new_node;
}
fn createNode(uri: []const u8, allocater: std.mem.Allocator) !*Node {
var new_node = try allocater.create(Node);
new_node.init(uri[0], allocater);
new_node.string_end = uri.len > 1 and uri[1] == '/';
return new_node;
}
Now we can rewrite the add
function:
fn add(self: *Node, uri: []const u8, allocater: std.mem.Allocator) !RESULT {
if (uri[0] < self.value) {
if (self.left) |left| {
return try left.add(uri, self.allocater);
} else {
self.left = try createNode(uri, allocater);
return self.left.add(uri, allocater);
}
} else if (uri[0] > self.value) {
if (self.right) |right| {
return try right.add(uri, self.allocater);
} else {
self.right = try createNode(uri, allocater);
return self.right.add(uri, allocater);
}
} else {
if (uri.len == 1) {
self.string_end = true;
return RESULT.ADDED;
} else {
if (self.equal) |equal| {
return equal.add(uri[1..], allocater);
} else {
var new_node = try allocater.create(Node);
new_node.init(uri[1], allocater);
if (uri[1] == '/') {
new_node.string_end = true;
}
self.equal = try createNode(); // NOTE this is where I noticed that our createNode function isn't quite right
return new_node.add(uri[1..], allocater);
}
}
}
}
The note shows the point where I noticed that init gets called with uri[1]
instead of with uri[0]
at this point, so we need to make a small change to the createNode
function:
fn createNode(uri: []const u8, val:u8, allocater: std.mem.Allocator) !*Node {
var new_node = try allocater.create(Node);
new_node.init(val, allocater);
new_node.string_end = uri.len > 1 and uri[1] == '/';
return new_node;
}
Now back to add
:
fn add(self: *Node, uri: []const u8, allocater: std.mem.Allocator) !RESULT {
if (uri[0] < self.value) {
if (self.left) |left| {
return try left.add(uri, self.allocater);
} else {
self.left = try createNode(uri, uri[0], allocater);
return self.left.add(uri, allocater);
}
} else if (uri[0] > self.value) {
if (self.right) |right| {
return try right.add(uri, self.allocater);
} else {
self.right = try createNode(uri, uri[0], allocater);
return self.right.add(uri, allocater);
}
} else {
if (uri.len == 1) {
self.string_end = true;
return RESULT.ADDED;
} else {
if (self.equal) |equal| {
return equal.add(uri[1..], allocater);
} else {
self.equal = try createNode(uri, uri[1], allocater);
return self.equal.add(uri[1..], allocater);
}
}
}
}
If you stare at it a bit you realize that there is still a lot of repetition here.
Lets take this for example:
if (self.left) |left| {
return try left.add(uri, self.allocater);
} else {
self.left = try createNode(uri, uri[0], allocater);
return self.left.add(uri, allocater);
}
The first try is redundant with the return:
if (self.left) |left| {
return left.add(uri, self.allocater);
} else {
self.left = try createNode(uri, uri[0], allocater);
return self.left.add(uri, allocater);
}
Now if we always had self.left
we could pull out the return self.left.add(uri, allocater);
from the branches, so a different way to look at it is:
if (self.left) |_| {
} else {
self.left = try createNode(uri, uri[0], allocater);
}
return self.left.add(uri, allocater);
Or:
if (self.left == null) self.left = try createNode(uri, uri[0], allocater);
return self.left.add(uri, allocater);
Again we can pull this out into a helper function:
fn ensureNode(node:*?*Node, uri:[]const u8, offset:u8, allocater:std.mem.Allocator) !RESULT {
if (node.* == null) node.* = try createNode(uri, uri[offset], allocater);
return node.*.?.add(uri[offset..], allocater);
}
And rewrite add
:
fn add(self: *Node, uri: []const u8, allocater: std.mem.Allocator) !RESULT {
if (uri[0] < self.value) {
return ensureNode(&self.left, uri, 0, allocater);
} else if (uri[0] > self.value) {
return ensureNode(&self.right, uri, 0, allocater);
} else {
if (uri.len == 1) {
self.string_end = true;
return RESULT.ADDED;
} else {
return ensureNode(&self.equal, uri, 1, allocater);
}
}
}
Use early exit:
fn add(self: *Node, uri: []const u8, allocater: std.mem.Allocator) !RESULT {
if (uri[0] < self.value) return ensureNode(&self.left, uri, 0, allocater);
if (uri[0] > self.value) return ensureNode(&self.right, uri, 0, allocater);
if (uri.len == 1) {
self.string_end = true;
return RESULT.ADDED;
}
return ensureNode(&self.equal, uri, 1, allocater);
}
Result:
fn search(self: *Node, uri: []const u8) RESULT {
if (self.search_for_node(uri)) |_| RESULT.FOUND else RESULT.COULD_NOT_BE_FOUND;
}
fn search_for_node(self: *Node, uri: []const u8) ?*Node {
if (uri[0] < self.value) return if (self.left) |left| left.search_for_node(uri) else null;
if (uri[0] > self.value) return if (self.right) |right| right.search_for_node(uri) else null;
if (uri.len == 1) return self;
return if (self.equal) |equal| equal.search_for_node(uri[1..]) else null;
}
fn createNode(uri: []const u8, val: u8, allocater: std.mem.Allocator) !*Node {
var new_node = try allocater.create(Node);
new_node.init(val, allocater);
new_node.string_end = uri.len > 1 and uri[1] == '/';
return new_node;
}
fn ensureNode(node: *?*Node, uri: []const u8, offset: u8, allocater: std.mem.Allocator) !RESULT {
if (node.* == null) node.* = try createNode(uri, uri[offset], allocater);
return node.*.?.add(uri[offset..], allocater);
}
// performs a search
// HOWEVER, if string is not found will add it to the trie
fn add(self: *Node, uri: []const u8, allocater: std.mem.Allocator) !RESULT {
if (uri[0] < self.value) return ensureNode(&self.left, uri, 0, allocater);
if (uri[0] > self.value) return ensureNode(&self.right, uri, 0, allocater);
if (uri.len == 1) {
self.string_end = true;
return RESULT.ADDED;
}
return ensureNode(&self.equal, uri, 1, allocater);
}
Unless I made a mistake somewhere, I haven’t changed anything about what the functions return.
One thing about the add
function, it seems it can only either fail with a Zig error, or return RESULT.ADDED
(even if the result already existed), thus the return value seems unneeded and I would probably change the function to just return !void
.
Alternatively you could keep track of whether something actually was added and otherwise return RESULT.FOUND
but that would be a bit more work.
I would probably do that by splitting the function into add
and addHelper
, add
would declare a var added = false;
and pass a reference &added
as argument to addHelper
which passes it to ensureNode
to set it to true if it creates a new node, ensureNode
would then call addHelper
instead of add
.