Infix to Postfix Regex Function in C

dude_the_builder · November 19, 2023, 6:40pm

The following C program contains the re2post function that converts a regular expression pattern string using infix operators into a pattern string using postfix operators. I believe this is Ken Thompson’s implementation with tweaks by Russ Cox. I found converting this code into Zig difficult because it makes heavy use of a few C-isms, especially iteration over the elements of arrays via pointer arithmetic.

#include <stdio.h>
#include <string.h>

/*
 * Convert infix regexp re to postfix notation.
 * Insert . as explicit concatenation operator.
 * Cheesy parser, return static buffer.
 */
char*
re2post(char *re)
{
	int nalt, natom;
	static char buf[8000];
	char *dst;
	struct {
		int nalt;
		int natom;
	} paren[100], *p;
	
	p = paren;
	dst = buf;
	nalt = 0;
	natom = 0;
	if(strlen(re) >= sizeof buf/2)
		return NULL;
	for(; *re; re++){
		switch(*re){
		case '(':
			if(natom > 1){
				--natom;
				*dst++ = '.';
			}
			if(p >= paren+100)
				return NULL;
			p->nalt = nalt;
			p->natom = natom;
			p++;
			nalt = 0;
			natom = 0;
			break;
		case '|':
			if(natom == 0)
				return NULL;
			while(--natom > 0)
				*dst++ = '.';
			nalt++;
			break;
		case ')':
			if(p == paren)
				return NULL;
			if(natom == 0)
				return NULL;
			while(--natom > 0)
				*dst++ = '.';
			for(; nalt > 0; nalt--)
				*dst++ = '|';
			--p;
			nalt = p->nalt;
			natom = p->natom;
			natom++;
			break;
		case '*':
		case '+':
		case '?':
			if(natom == 0)
				return NULL;
			*dst++ = *re;
			break;
		default:
			if(natom > 1){
				--natom;
				*dst++ = '.';
			}
			*dst++ = *re;
			natom++;
			break;
		}
	}
	if(p != paren)
		return NULL;
	while(--natom > 0)
		*dst++ = '.';
	for(; nalt > 0; nalt--)
		*dst++ = '|';
	*dst = 0;
	return buf;
}

int main(void) {
	printf("%s\n", re2post("a(b|c)+d"));
}

Particularly difficult to understand is the handling of the paren array of structs, which seems to me is basically a stack, but the implicit use of default values for ints makes it hard to follow to a non-C programmer like me.

Compiled with

zig cc re2post.c -o re2post

and run with

./re2post

it produces the output:

abc|+.d.

IntegratedQuantum · November 19, 2023, 7:29pm

Here is my translation approach.
Essentially I decided to use ArrayLists, allocations and errors instead of pointer arithmetic, fixed size buffers and null. Overall that simplified the code a bit and made it more readable. It probably makes it less performant though.

I kept most of the original naming scheme, honestly I don’t even understand some of these(nalt? natom?) so I decided it’s best to stick with the original names.

const std = @import("std");

fn re2post(allocator: std.mem.Allocator, re: []const u8) ![]const u8 {
	var dst = std.ArrayList(u8).init(allocator);
	errdefer dst.deinit();
	const N = struct {
		alt: u32 = 0,
		atom: u32 = 0,
	};
	var cur_n: N = .{};
	var stack = std.ArrayList(N).init(allocator);
	defer stack.deinit();
	
	for(re) |cur_byte| {
		switch(cur_byte) {
			'(' => {
				if(cur_n.atom > 1) {
					cur_n.atom -= 1;
					try dst.append('.');
				}
				try stack.append(cur_n);
				cur_n = .{};
			},
			'|' => {
				if(cur_n.atom == 0) return error.ParserError;
				try dst.appendNTimes('.', cur_n.atom - 1);
				cur_n.atom = 0;
				cur_n.alt += 1;
			},
			')' => {
				if(cur_n.atom == 0) return error.ParserError;
				try dst.appendNTimes('.', cur_n.atom - 1);
				try dst.appendNTimes('|', cur_n.alt);
				cur_n = stack.popOrNull() orelse return error.ParserError;
				cur_n.atom += 1;
			},
			'*', '+', '?' => {
				if(cur_n.atom == 0) return error.ParserError;
				try dst.append(cur_byte);
			},
			else => {
				if(cur_n.atom > 1) {
					cur_n.atom -= 1;
					try dst.append('.');
				}
				try dst.append(cur_byte);
				cur_n.atom += 1;
			},
		}
	}
	if(stack.items.len != 0) return error.ParserError;
	try dst.appendNTimes('.', cur_n.atom - 1);
	try dst.appendNTimes('|', cur_n.alt);
	return try dst.toOwnedSlice();
}

pub fn main() !void {
	std.log.info("{s}", .{try re2post(std.heap.page_allocator, "a(b|c)+d")});
}

You can simply run it with zig run re2post.zig