Why is my Zig program so much slower than my C program?

Writing a program using TurboJPEG. I have a C version and a Zig version. The C version completely blows the Zig version out of the water. What am I doing wrong?
Zig code:

const std = @import("std");
const c = @cImport({
    @cInclude("turbojpeg.h");
});
const stdout = std.io.getStdOut().writer();

// const density = "N@#W$987654321!abc;:+=-,._ ";

pub fn main() !void {
    const allocator = std.heap.c_allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    if (std.os.argv.len != 2) {
        std.debug.print("Usage: {s} <JPEG file>\n", .{std.os.argv[0]});
        return;
    }
    var args = std.process.ArgIteratorPosix.init();
    _ = args.skip();

    const filename = args.next().?;
    var file = try std.fs.cwd().openFile(filename, .{});
    defer file.close();

    const file_size = try file.getEndPos();
    const jpeg_buf = try alloc.alloc(u8, file_size);
    defer alloc.free(jpeg_buf);

    _ = try file.read(jpeg_buf);

    const tj_instance = c.tjInitDecompress();
    if (tj_instance == null) {
        std.debug.print("Error initializing TurboJPEG decompressor: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    var width: c_int = 0;
    var height: c_int = 0;
    var jpeg_subsamp: c_int = 0;
    var jpeg_colorspace: c_int = 0;
    const jpeg_len: c_ulong = @intCast(jpeg_buf.len);
    if (c.tjDecompressHeader3(tj_instance, jpeg_buf.ptr, jpeg_len, &width, &height, &jpeg_subsamp, &jpeg_colorspace) != 0) {
        std.debug.print("Error reading JPEG header: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    const pixel_size = 3;
    const img_buf_size: usize = @intCast(width * height * pixel_size);
    const img_buf = try alloc.alloc(u8, img_buf_size);

    if (c.tjDecompress2(tj_instance, jpeg_buf.ptr, jpeg_len, img_buf.ptr, width, 0, height, c.TJPF_RGB, 0) != 0) {
        std.debug.print("Error decompressing JPEG image: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    _ = c.tjDestroy(tj_instance);
    var i: usize = 0;
    while (i < width * height) : (i += 1) {
        const r: u32 = img_buf[i * pixel_size];
        const g: u32 = img_buf[i * pixel_size + 1];
        const b: u32 = img_buf[i * pixel_size + 2];
        const rf: f32 = @floatFromInt(r);
        const gf: f32 = @floatFromInt(g);
        const bf: f32 = @floatFromInt(b);
        const rgray: f32 = 0.299 * rf;
        const ggray: f32 = 0.587 * gf;
        const bgray: f32 = 0.114 * bf;
        const gray: u8 = @intFromFloat(rgray + ggray + bgray);
        try stdout.print("Pixel {d}: Brightness={d}\n", .{ i, gray });
        // std.debug.print("Pixel {d}: Brightness={d}\n", .{ i, gray });
        // std.debug.print("Pixel {d}: R={d}, G={d}, B={d}\n", .{i, img_buf[i * pixel_size], img_buf[i * pixel_size + 1], img_buf[i * pixel_size + 2]});
    }
    // std.debug.print("Image dimensions: {d} x {d}\n", .{ width, height });
}

C:

#include <stdio.h>
#include <stdlib.h>
#include <turbojpeg.h>
#include <string.h>

int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <JPEG file>\n", argv[0]);
        return EXIT_FAILURE;
    }

    const char *filename = argv[1];
    FILE *jpegFile = fopen(filename, "rb");
    if (!jpegFile) {
        perror("Error opening file");
        return EXIT_FAILURE;
    }

    fseek(jpegFile, 0, SEEK_END);
    long jpegSize = ftell(jpegFile);
    fseek(jpegFile, 0, SEEK_SET);

    unsigned char *jpegBuf = (unsigned char *)malloc(jpegSize);
    if (!jpegBuf) {
        perror("Error allocating memory");
        fclose(jpegFile);
        return EXIT_FAILURE;
    }
    fread(jpegBuf, jpegSize, 1, jpegFile);
    fclose(jpegFile);

    tjhandle tjInstance = tjInitDecompress();
    if (!tjInstance) {
        fprintf(stderr, "Error initializing TurboJPEG decompressor: %s\n", tjGetErrorStr());
        free(jpegBuf);
        return EXIT_FAILURE;
    }

    int width, height, jpegSubsamp, jpegColorspace;
    if (tjDecompressHeader3(tjInstance, jpegBuf, jpegSize, &width, &height, &jpegSubsamp, &jpegColorspace) != 0) {
        fprintf(stderr, "Error reading JPEG header: %s\n", tjGetErrorStr());
        tjDestroy(tjInstance);
        free(jpegBuf);
        return EXIT_FAILURE;
    }

    printf("Image dimensions: %d x %d\n", width, height);

    unsigned char *imgBuf = (unsigned char *)malloc(width * height * tjPixelSize[TJPF_RGB]);
    if (!imgBuf) {
        perror("Error allocating memory");
        tjDestroy(tjInstance);
        free(jpegBuf);
        return EXIT_FAILURE;
    }

    if (tjDecompress2(tjInstance, jpegBuf, jpegSize, imgBuf, width, 0 /* pitch */, height, TJPF_RGB, 0) != 0) {
        fprintf(stderr, "Error decompressing JPEG image: %s\n", tjGetErrorStr());
        tjDestroy(tjInstance);
        free(jpegBuf);
        free(imgBuf);
        return EXIT_FAILURE;
    }

    tjDestroy(tjInstance);
    free(jpegBuf);

    for (int i = 0; i < width * height; i++) {
        int r = imgBuf[i * 3];
        int g = imgBuf[i * 3 + 1];
        int b = imgBuf[i * 3 + 2];
        unsigned char gray = (unsigned char)(0.299 * r + 0.587 * g + 0.114 * b);
        printf("Pixel %d: Brightness=%d\n", i, gray);
    }

    free(imgBuf);
    return EXIT_SUCCESS;
}

using gcc 13.2.1 20240210
zig 0.14.0-dev.43+96501d338

performance tests using hyperfine on a clean reboot:

hyperfine -r 10 "/home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg" "/home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg"
Benchmark 1: /home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):     154.5 ms ±   4.0 ms    [User: 148.0 ms, System: 5.6 ms]
  Range (min … max):   149.3 ms … 163.4 ms    10 runs

Benchmark 2: /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):      1.894 s ±  0.074 s    [User: 0.777 s, System: 1.144 s]
  Range (min … max):    1.746 s …  2.002 s    10 runs

Summary
  /home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg ran
   12.26 ± 0.58 times faster than /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg

clang benchmarks using 17.0.6

wizard@oz ~/fun/zig/learn/tag/src $ hyperfine -r 10 "/home/wizard/fun/c/things/tjpg/clang_jpeg /home/wizard/downloads/suisei-pfp.jpg" "/home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg"
Benchmark 1: /home/wizard/fun/c/things/tjpg/clang_jpeg /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):     153.6 ms ±   3.6 ms    [User: 148.8 ms, System: 4.6 ms]
  Range (min … max):   147.1 ms … 157.3 ms    10 runs

Benchmark 2: /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):      1.945 s ±  0.058 s    [User: 0.742 s, System: 1.201 s]
  Range (min … max):    1.882 s …  2.015 s    10 runs

Summary
  /home/wizard/fun/c/things/tjpg/clang_jpeg /home/wizard/downloads/suisei-pfp.jpg ran
   12.66 ± 0.48 times faster than /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg
1 Like

How are you building it? What optimizations are you using?

zig build -Doptimize=ReleaseFast
gcc -o decode_jpeg main.c -lturbojpeg

1 Like

Reading the title alone, let me guess: non-buffered stdio.


Now let’s read your code and find out… yep there it is. 5 syscalls for every pixel.

Run each program with strace and notice the difference.

12 Likes

Thank you Andrew,

I’m pretty new to systems programming and I’m kinda learning all of the system call thing through Zig and I appreciate your work a lot. I don’t even do programming as a day job so this helped me a lot actually. I didn’t even know about strace before this. Adding a buffered reader and writer to my program made it faster than C on -O3.

this 3 year old article from Loris helped me understand why buffered stdio is very important in Zig.

benchmarks:

hyperfine -r 10 "/home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg" "/home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg"
Benchmark 1: /home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):     152.9 ms ±   2.1 ms    [User: 145.6 ms, System: 7.2 ms]
  Range (min … max):   149.5 ms … 157.8 ms    10 runs

Benchmark 2: /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg
  Time (mean ± σ):      72.2 ms ±   1.4 ms    [User: 66.9 ms, System: 4.5 ms]
  Range (min … max):    68.9 ms …  73.8 ms    10 runs

Summary
  /home/wizard/fun/zig/learn/tag/zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg ran
    2.12 ± 0.05 times faster than /home/wizard/fun/c/things/tjpg/decode_jpeg /home/wizard/downloads/suisei-pfp.jpg
7 Likes

Despite this. Even if my Zig program is faster. It still has the same amount of syscalls. How can I further improve this?

Before:
I’m not too sure how I got it to be 0.00

Usage: ../zig-out/bin/tag <JPEG file>
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
  0.00    0.000000           0         2           read
  0.00    0.000000           0         3           write
  0.00    0.000000           0         3           close
  0.00    0.000000           0         3           fstat
  0.00    0.000000           0        12           mmap
  0.00    0.000000           0         4           mprotect
  0.00    0.000000           0         1           munmap
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         1           rt_sigaction
  0.00    0.000000           0         2           pread64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           gettid
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0         3           openat
  0.00    0.000000           0         1           set_robust_list
  0.00    0.000000           0         3           prlimit64
  0.00    0.000000           0         1           rseq
------ ----------- ----------- --------- --------- ----------------
100.00    0.000000           0        45         1 total

After:

Usage: ../zig-out/bin/tag <JPEG file>
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 50.43    0.000291         291         1           execve
 19.76    0.000114           9        12           mmap
  7.11    0.000041          13         3           openat
  3.81    0.000022           7         3           write
  3.64    0.000021           5         4           mprotect
  2.25    0.000013           6         2           read
  2.25    0.000013           4         3           close
  2.08    0.000012          12         1           munmap
  1.91    0.000011           3         3           fstat
  1.73    0.000010           3         3           prlimit64
  1.21    0.000007           7         1         1 access
  0.87    0.000005           5         1           brk
  0.69    0.000004           2         2           pread64
  0.52    0.000003           3         1           set_tid_address
  0.35    0.000002           2         1           rt_sigaction
  0.35    0.000002           2         1           arch_prctl
  0.35    0.000002           2         1           gettid
  0.35    0.000002           2         1           set_robust_list
  0.35    0.000002           2         1           rseq
------ ----------- ----------- --------- --------- ----------------
100.00    0.000577          12        45         1 total

Where is the bottleneck for all of these syscalls? Is this just an optimization of the Zig compiler? Is it the way I’m typecasting from an integer? I know Zig prefers to be more explicit, so this leads me to think this is not necessarily true. But I have no idea. I am perfectly happy with my results but I am just curious.

const std = @import("std");
const c = @cImport({
    @cInclude("turbojpeg.h");
});
const out = std.io.getStdOut();

// const density = "N@#W$987654321!abc;:+=-,._ ";

pub fn main() !void {
    var buf = std.io.bufferedWriter(out.writer());
    const stdout = buf.writer();
    const allocator = std.heap.c_allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    if (std.os.argv.len != 2) {
        std.debug.print("Usage: {s} <JPEG file>\n", .{std.os.argv[0]});
        return;
    }
    var args = std.process.ArgIteratorPosix.init();
    _ = args.skip();

    const filename = args.next().?;
    var file = try std.fs.cwd().openFile(filename, .{});
    defer file.close();

    var buf_reader = std.io.bufferedReader(file.reader());
    const reader = buf_reader.reader();
    const file_size = try file.getEndPos();
    const jpeg_buf = try alloc.alloc(u8, file_size);
    defer alloc.free(jpeg_buf);

    _ = try reader.read(jpeg_buf);

    const tj_instance = c.tjInitDecompress();
    if (tj_instance == null) {
        std.debug.print("Error initializing TurboJPEG decompressor: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    var width: c_int = 0;
    var height: c_int = 0;
    var jpeg_subsamp: c_int = 0;
    var jpeg_colorspace: c_int = 0;
    const jpeg_len: c_ulong = @intCast(jpeg_buf.len);
    if (c.tjDecompressHeader3(tj_instance, jpeg_buf.ptr, jpeg_len, &width, &height, &jpeg_subsamp, &jpeg_colorspace) != 0) {
        std.debug.print("Error reading JPEG header: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    const pixel_size = 3;
    const img_buf_size: usize = @intCast(width * height * pixel_size);
    const img_buf = try alloc.alloc(u8, img_buf_size);

    if (c.tjDecompress2(tj_instance, jpeg_buf.ptr, jpeg_len, img_buf.ptr, width, 0, height, c.TJPF_RGB, 0) != 0) {
        std.debug.print("Error decompressing JPEG image: {s}\n", .{c.tjGetErrorStr()});
        return;
    }

    _ = c.tjDestroy(tj_instance);
    var i: usize = 0;
    while (i < width * height) : (i += 1) {
        const r: u32 = img_buf[i * pixel_size];
        const g: u32 = img_buf[i * pixel_size + 1];
        const b: u32 = img_buf[i * pixel_size + 2];
        const rf: f32 = @floatFromInt(r);
        const gf: f32 = @floatFromInt(g);
        const bf: f32 = @floatFromInt(b);
        const rgray: f32 = 0.299 * rf;
        const ggray: f32 = 0.587 * gf;
        const bgray: f32 = 0.114 * bf;
        const gray: u8 = @intFromFloat(rgray + ggray + bgray);
        try stdout.print("Pixel {d}: Brightness={d}\n", .{ i, gray });
        // std.debug.print("Pixel {d}: Brightness={d}\n", .{ i, gray });
        // std.debug.print("Pixel {d}: R={d}, G={d}, B={d}\n", .{i, img_buf[i * pixel_size], img_buf[i * pixel_size + 1], img_buf[i * pixel_size + 2]});
    }
    // std.debug.print("Image dimensions: {d} x {d}\n", .{ width, height });
}

Thank you, for any responses in advance.

Unless I’m mistaken, it looks like those straces are for printing the usage only since they both have

Usage: ../zig-out/bin/tag <JPEG file>

at the top.

1 Like

I’m not too sure what I’m doing wrong.

The command I am running is: strace -c <PROGRAM>

commands ran:

strace -c ../zig-out/bin/tag /home/wizard/downloads/suisei-pfp.jpg
strace -c ./decode_jpeg /home/wizard/downloads/suisei-pfp.jpg

Is this the incorrect way?

I ran it a couple times on the new iteration of the code and managed to get something that wasn’t 0.00. most of them were 0.00 though. Is there anyway to flush it?

wizard@oz ~/fun/zig/learn/tag/src $ strace -c ../zig-out/bin/tag
Usage: ../zig-out/bin/tag <JPEG file>
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 29.76    0.000025          25         1           munmap
 28.57    0.000024           8         3           write
 21.43    0.000018           6         3           prlimit64
 11.90    0.000010           2         4           mprotect
  4.76    0.000004           4         1           rt_sigaction
  3.57    0.000003           3         1           gettid
  0.00    0.000000           0         2           read
  0.00    0.000000           0         3           close
  0.00    0.000000           0         3           fstat
  0.00    0.000000           0        12           mmap
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         2           pread64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0         3           openat
  0.00    0.000000           0         1           set_robust_list
  0.00    0.000000           0         1           rseq
------ ----------- ----------- --------- --------- ----------------
100.00    0.000084           1        45         1 total

EDIT:
It is extremely late for me. I did forget the file.
Zig:

Pixel 1799937% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 99.96    1.360907         106     12778           write
  0.03    0.000460         153         3           munmap
  0.00    0.000037           0        65           read
  0.00    0.000008           0        14           mmap
  0.00    0.000004           1         4           close
  0.00    0.000000           0         4           fstat
  0.00    0.000000           0         4           mprotect
  0.00    0.000000           0         3           brk
  0.00    0.000000           0         1           rt_sigaction
  0.00    0.000000           0         2           pread64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0         4           openat
  0.00    0.000000           0         1           set_robust_list
  0.00    0.000000           0         3           prlimit64
  0.00    0.000000           0         1           getrandom
  0.00    0.000000           0         1           rseq
------ ----------- ----------- --------- --------- ----------------
100.00    1.361416         105     12892         1 total

C:

Pixel 1799999: Brightness=73
------ ----------- ----------- --------- --------- ----------------
 99.99   11.574238           6   1800001           write
  0.01    0.000880         293         3           munmap
  0.00    0.000124          24         5           read
  0.00    0.000003           0         4           close
  0.00    0.000003           0        14           mmap
  0.00    0.000002           0         6           fstat
  0.00    0.000000           0         2           lseek
  0.00    0.000000           0         4           mprotect
  0.00    0.000000           0         3           brk
  0.00    0.000000           0         2           pread64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0         4           openat
  0.00    0.000000           0         1           set_robust_list
  0.00    0.000000           0         1           prlimit64
  0.00    0.000000           0         1           getrandom
  0.00    0.000000           0         1           rseq
------ ----------- ----------- --------- --------- ----------------
100.00   11.575250           6   1800056         1 total

But for some reason, it is cutting off before it gets to the end of the file? Not too sure how that is happening. Before, I added the buffered reader and writer it got to the end of the file. That’s something for tomorrow I guess :slight_smile:

Need to call buf.flush() after everything has been written to ensure anything left over in the buffer gets emitted.

1 Like

Cheers! I hope my response didn’t seem angry - I was just having a bit of fun with it.

strace is a great tool to add to your repertoire - I’m glad this was an opportunity to learn about it.

There are a bunch more neat tools to learn about. Stick around and keep posting good questions with source code and you’ll gain knowledge that will help you not only with Zig but with C/C++ and other mainstream languages as well.

Happy hacking!

11 Likes

This works! I never realized what flush() did

No hard feelings :slight_smile: I watch a lot of your talks on YouTube (which I can’t wait for more!) so I knew what you were trying to get across. Thank you for replying to my post!

Yes! More Andrew K. talks please!

2 Likes

More talks, more streams, more content, please @andrewrk!

3 Likes

Thank you for the kind words, all.

To be honest I have not been streaming because I am saving up money to hire an electrician to wire an Ethernet cable up to my office. I tried streaming a couple weeks ago but the WiFi connection kept cutting out, interrupting the stream. I think I should be able to afford it next month. I also need to do some roof repairs however :grimacing:

10 Likes

2 posts were split to a new topic: Is Zig Error Prone?