Output the zig1 executable faster using object files and multi-threading

Creating zig1 requires both zig1.c (over 200 MB) and stage1/wasi.c.

The C compiler then outputs zig1, but this step is single-threaded.

I made zig1-wasm2c output object files using the following patch:

diff --git a/stage1/wasm2c.c b/stage1/wasm2c.c
index 425cc682b8..c6ffe36f47 100644
--- a/stage1/wasm2c.c
+++ b/stage1/wasm2c.c
@@ -76,13 +76,15 @@ static void renderExpr(FILE *out, struct InputStream *in) {
 static const uint32_t big_endian = 0xff000000;
 
 int main(int argc, char **argv) {
-    if (argc != 3 && argc != 4) {
-        fprintf(stderr, "usage: %s <in.wasm.zst> <out.c> [endian]\n", argv[0]);
+    if (argc != 3 && argc != 4 && argc != 5) {
+        fprintf(stderr, "usage: %s <in.wasm.zst> <out.c> [endian] [Output this many C files]\n", argv[0]);
         return 1;
     }
 
     bool is_big_endian;
 
+    uint32_t num_c_files = 1;
+
     if (argc >= 4) {
         if (!strcmp(argv[3], "big")) {
             is_big_endian = true;
@@ -96,6 +98,12 @@ int main(int argc, char **argv) {
         is_big_endian = *(uint8_t *)&big_endian; // Infer from host endianness.
     }
 
+    if (argc >= 5)
+        num_c_files = strtoul(argv[4], NULL, 0);
+
+    if (num_c_files == ULONG_MAX || num_c_files == 0)
+        panic("strtoul failed");
+
     const char *mod = "wasm";
 
     struct InputStream in;
@@ -107,7 +115,14 @@ int main(int argc, char **argv) {
         InputStream_readByte(&in) != 'm') panic("input is not a zstd-compressed wasm file");
     if (InputStream_readLittle_u32(&in) != 1) panic("unsupported wasm version");
 
-    FILE *out = fopen(argv[2], "wb");
+    FILE *out;
+    if (num_c_files == 1) {
+        out = fopen(argv[2], "wb");
+    } else {
+        out = fopen("zig1.h", "wb");
+    }
+    FILE *tmp_out;
+    FILE **num_c_files_stream;
     if (out == NULL) panic("unable to open output file");
     fputs("#include <float.h>\n"
           "#include <math.h>\n"
@@ -413,7 +428,11 @@ int main(int argc, char **argv) {
         for (uint32_t i = 0; i < len; i += 1) {
             funcs[i].type_idx = InputStream_readLeb128_u32(&in);
             const struct FuncType *func_type = &types[funcs[i].type_idx];
-            fputs("static ", out);
+            if (num_c_files == 1) {
+                fputs("static ", out);
+            } else {
+                fputs("extern ", out);
+            }
             switch (func_type->result->len) {
                 case 0: fputs("void", out); break;
                 case 1: fputs(WasmValType_toC(func_type->result->types[0]), out); break;
@@ -572,12 +591,32 @@ int main(int argc, char **argv) {
         uint32_t *param_stash = malloc(sizeof(uint32_t) * max_param_len);
 
         uint32_t len = InputStream_readLeb128_u32(&in);
+        char tmpstrbuffer[64];
+        if (num_c_files > len)
+            num_c_files = len;
+        if (num_c_files > 1){
+            num_c_files_stream = malloc(sizeof(FILE *) * num_c_files);
+            if (num_c_files_stream == NULL)
+                panic("out of memory");
+            for (uint32_t n = 0; n < num_c_files; ++n) {
+                sprintf(tmpstrbuffer, "zig1_%" PRIu32 ".c", n);
+                num_c_files_stream[n] = fopen(tmpstrbuffer, "wb");
+                if (num_c_files_stream[n] == NULL)
+                    panic("unable to open file");
+                fputs("#include \"zig1.h\"\n\n", num_c_files_stream[n]);
+            }
+        }
+        tmp_out = out;
         for (uint32_t func_i = 0; func_i < len; func_i += 1) {
+            if (num_c_files > 1)
+                out = num_c_files_stream[func_i % num_c_files];
+
             FuncGen_reset(&fg);
 
             InputStream_readLeb128_u32(&in);
             const struct FuncType *func_type = &types[funcs[func_i].type_idx];
-            fputs("static ", out);
+            if (num_c_files == 1)
+                fputs("static ", out);
             switch (func_type->result->len) {
                 case 0: fputs("void", out); break;
                 case 1: fputs(WasmValType_toC(func_type->result->types[0]), out); break;
@@ -2253,6 +2292,7 @@ int main(int argc, char **argv) {
             }
             fputs("}\n\n", out);
         }
+        out = tmp_out;
     }
 
     (void)InputStream_skipToSection(&in, WasmSectionId_data);

This outputs a zig1.h file and several zig1_N.c files. Each zig1_N.c file includes zig1.h and contains some of the fN() functions.

Run the command

./zig-wasm2c ../stage1/zig1.wasm zig1.c little <N>

to get <N> C files.

Then get the object files:

find . -type f -name "zig1_*.c" -print0 | xargs -0 -P <N> -I {} gcc {} -c -Os -lm -std=c99 ;
gcc ../stage1/wasi.c -c -Os -lm -std=c99

where <N> is the number of compile jobs to run in parallel.

When I try to link, however:

gcc $(ls zig1_*.o) wasi.o -o zig1_mt.o -Os -lm -std=c99

I get a multiple definition error:

zig1_7.c:(.text+0x26bc9): multiple definition of `load16_align0'; zig1_0.o:zig1_0.c:(.text+0x26b91): first defined here

How do I proceed from here?

1 Like

Welcome to the forum! That sounds like an interesting idea. The load16_align0 function (along with several others) is coming from this code, which in your patch you are outputting to zig1.h: zig/stage1/wasm2c.c at 37037b269e4484702d12a3c8c909d4b8ed5895f1 · ziglang/zig · GitHub The issue is that some of these functions, including load16_align0, are not static, and they’re being duplicated across all the zig1_N.c files, each of which includes zig1.h. That’s why the linker is complaining that those symbols are being duplicated.

Those functions are also used by wasi.c: zig/stage1/wasi.c at 37037b269e4484702d12a3c8c909d4b8ed5895f1 · ziglang/zig · GitHub so you may have to make additional changes to account for that as well, depending on how you choose to resolve this issue.

I refactored the code to now include two header files: zig1.h and wasi_gen.h.

get-zig1-faster.patch

zig1.h contains only extern functions and globals, while wasi_gen.h contains the function definitions.

I compared the performance of building zig1, and of running zig1 itself, using 8 threads.

Code for all the shell scripts:

zig1-st.sh

#!/bin/bash
./zig-wasm2c stage1/zig1.wasm zig1.c
gcc -Os -lm -std=c99 stage1/wasi.c zig1.c -o zig1-st

zig1-mt.sh

#!/bin/bash
./zig-wasm2c stage1/zig1.wasm zig1.c little 8
cc stage1/wasi.c -c -DZIG1_WASI_INCLUDE_HEADER="wasi_gen.h" -Os -std=c99
find . -type f -name "zig1_*.c" -print0 | xargs -0 -P 8 -I {} gcc {} -c -Os -std=c99 ;
cc $(ls zig1_*.o) wasi.o -o zig1-mt -Os -lm

./zig1-mt-lto.sh

#!/bin/bash
./zig-wasm2c stage1/zig1.wasm zig1.c little 8
cc stage1/wasi.c -c -DZIG1_WASI_INCLUDE_HEADER="wasi_gen.h" -Os -std=c99 -flto=8
find . -type f -name "zig1_*.c" -print0 | xargs -0 -P 8 -I {} gcc {} -c -Os -std=c99 -flto=8;
cc $(ls zig1_*.o) wasi.o -o zig1-mt-lto -Os -lm -flto=8

LTO is the fastest:

./x86_64-linux-poop './zig1-st.sh' './zig1-mt.sh' './zig1-mt-lto.sh'
Benchmark 1 (3 runs): ./zig1-st.sh
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           114s  ± 6.16s      110s  …  121s           0 ( 0%)        0%
  peak_rss           2.12GB ±  206KB    2.12GB … 2.12GB          0 ( 0%)        0%
  cpu_cycles          385G  ± 9.88G      379G  …  396G           0 ( 0%)        0%
  instructions        680G  ± 43.1M      680G  …  680G           0 ( 0%)        0%
  cache_references   9.26G  ±  113M     9.19G  … 9.39G           0 ( 0%)        0%
  cache_misses       2.17G  ±  130M     2.09G  … 2.32G           0 ( 0%)        0%
  branch_misses      2.01G  ±  727K     2.01G  … 2.01G           0 ( 0%)        0%
Benchmark 2 (3 runs): ./zig1-mt.sh
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           112s  ± 29.0ms     112s  …  112s           0 ( 0%)          -  1.8% ±  8.7%
  peak_rss           1.07GB ±  331KB    1.07GB … 1.07GB          0 ( 0%)        ⚡- 49.2% ±  0.0%
  cpu_cycles         1.70T  ± 14.3G     1.69T  … 1.72T           0 ( 0%)        💩+342.2% ±  7.2%
  instructions       1.44T  ± 24.9M     1.44T  … 1.44T           0 ( 0%)        💩+111.6% ±  0.0%
  cache_references   23.6G  ±  115M     23.5G  … 23.7G           0 ( 0%)        💩+154.8% ±  2.8%
  cache_misses       10.4G  ± 28.1M     10.4G  … 10.4G           0 ( 0%)        💩+379.8% ±  9.8%
  branch_misses      4.44G  ± 10.6M     4.43G  … 4.45G           0 ( 0%)        💩+121.1% ±  0.8%
Benchmark 3 (3 runs): ./zig1-mt-lto.sh
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          71.5s  ± 90.4ms    71.4s  … 71.6s           0 ( 0%)        ⚡- 37.1% ±  8.7%
  peak_rss            799MB ±  391KB     799MB …  800MB          0 ( 0%)        ⚡- 62.2% ±  0.0%
  cpu_cycles         1.10T  ± 6.14G     1.09T  … 1.10T           0 ( 0%)        💩+185.5% ±  4.8%
  instructions        922G  ± 19.4M      922G  …  922G           0 ( 0%)        💩+ 35.6% ±  0.0%
  cache_references   14.5G  ± 29.1M     14.5G  … 14.5G           0 ( 0%)        💩+ 56.4% ±  2.0%
  cache_misses       5.98G  ± 8.63M     5.97G  … 5.99G           0 ( 0%)        💩+175.9% ±  9.6%
  branch_misses      2.91G  ± 2.60M     2.91G  … 2.92G           0 ( 0%)        💩+ 45.0% ±  0.2%

I modified bootstrap.c to use these new zig1 binaries:

diff --git a/bootstrap.c b/bootstrap.c
index a37834f463..3b9542dea1 100644
--- a/bootstrap.c
+++ b/bootstrap.c
@@ -100,12 +100,14 @@ int main(int argc, char **argv) {
     const char *cc = get_c_compiler();
     const char *host_triple = get_host_triple();
 
+#ifndef GOTO_ZIG2
     {
         const char *child_argv[] = {
             cc, "-o", "zig-wasm2c", "stage1/wasm2c.c", "-O2", "-std=c99", NULL,
         };
         print_and_run(child_argv);
     }
+#ifndef ZIG1_FASTER
     {
         const char *child_argv[] = {
             "./zig-wasm2c", "stage1/zig1.wasm", "zig1.c", NULL,
@@ -118,6 +120,15 @@ int main(int argc, char **argv) {
         };
         print_and_run(child_argv);
     }
+#else
+    {
+        const char *child_argv[] = {
+            "./ZIG1.sh", NULL,
+        };
+       print_and_run(child_argv);
+    }
+#endif
+#endif
     {
         FILE *f = fopen("config.zig", "wb");
         if (f == NULL)
@@ -151,7 +162,7 @@ int main(int argc, char **argv) {
 
     {
         const char *child_argv[] = {
-            "./zig1", "lib", "build-exe",
+            argv[1], "lib", "build-exe",
             "-ofmt=c", "-lc", "-OReleaseSmall",
             "--name", "zig2", "-femit-bin=zig2.c",
             "-target", host_triple,
@@ -167,7 +178,7 @@ int main(int argc, char **argv) {
 
     {
         const char *child_argv[] = {
-            "./zig1", "lib", "build-obj",
+            argv[1], "lib", "build-obj",
             "-ofmt=c", "-OReleaseSmall",
             "--name", "compiler_rt", "-femit-bin=compiler_rt.c",
             "-target", host_triple,

Build the new bootstrap.c:

gcc bootstrap.c -O2 -DGOTO_ZIG2 -o bootstrap-zig2

The multithreaded binaries turn out to be slower and larger:

./x86_64-linux-poop './bootstrap-zig2 ./zig1-st' './bootstrap-zig2 ./zig1-mt' './bootstrap-zig2 ./zig1-mt-lto'
Benchmark 1 (3 runs): ./bootstrap-zig2 ./zig1-st
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           413s  ± 9.98s      403s  …  423s           0 ( 0%)        0%
  peak_rss           5.37GB ±  509KB    5.37GB … 5.37GB          0 ( 0%)        0%
  cpu_cycles         1.39T  ± 13.5G     1.38T  … 1.40T           0 ( 0%)        0%
  instructions       2.29T  ±  125M     2.29T  … 2.29T           0 ( 0%)        0%
  cache_references   29.2G  ±  140M     29.1G  … 29.4G           0 ( 0%)        0%
  cache_misses       7.08G  ±  138M     6.92G  … 7.18G           0 ( 0%)        0%
  branch_misses      8.91G  ± 3.15M     8.91G  … 8.91G           0 ( 0%)        0%
Benchmark 2 (3 runs): ./bootstrap-zig2 ./zig1-mt
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           458s  ± 5.69s      452s  …  462s           0 ( 0%)        💩+ 10.8% ±  4.5%
  peak_rss           5.37GB ± 99.1KB    5.37GB … 5.37GB          0 ( 0%)          +  0.0% ±  0.0%
  cpu_cycles         1.52T  ± 2.82G     1.52T  … 1.52T           0 ( 0%)        💩+  9.5% ±  1.6%
  instructions       2.47T  ± 61.8M     2.47T  … 2.47T           0 ( 0%)        💩+  7.8% ±  0.0%
  cache_references   29.6G  ± 67.3M     29.6G  … 29.7G           0 ( 0%)          +  1.2% ±  0.9%
  cache_misses       6.96G  ± 21.8M     6.93G  … 6.97G           0 ( 0%)          -  1.7% ±  3.2%
  branch_misses      8.97G  ± 5.35M     8.97G  … 8.98G           0 ( 0%)          +  0.7% ±  0.1%
Benchmark 3 (3 runs): ./bootstrap-zig2 ./zig1-mt-lto
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           451s  ± 1.34s      450s  …  453s           0 ( 0%)        💩+  9.2% ±  3.9%
  peak_rss           5.37GB ±  377KB    5.37GB … 5.37GB          0 ( 0%)          -  0.0% ±  0.0%
  cpu_cycles         1.54T  ± 1.44G     1.54T  … 1.54T           0 ( 0%)        💩+ 10.7% ±  1.6%
  instructions       2.50T  ±  116M     2.50T  … 2.50T           0 ( 0%)        💩+  9.1% ±  0.0%
  cache_references   29.6G  ± 39.0M     29.6G  … 29.6G           0 ( 0%)          +  1.3% ±  0.8%
  cache_misses       6.97G  ± 13.5M     6.96G  … 6.99G           0 ( 0%)          -  1.5% ±  3.1%
  branch_misses      8.96G  ± 1.52M     8.96G  … 8.96G           0 ( 0%)          +  0.6% ±  0.1%

This was compiled with gcc (Gentoo 14.2.1_p20241221 p7) 14.2.1 20241221 and GNU ld (Gentoo 2.43 p3) 2.43.1.