Tarball viewer - chung-leong/zigar GitHub Wiki

In this exercise we're going to create a web app that allows the user to view contents stored in a GitHub tarball. It demonstrates how Zig code can read a web stream obtained through the Fetch API.

Starting out

First, we'll create a boilerplate Vite app. In a terminal window, run the following command:

npm create vite@latest

Enter code-viewer as name, then select React and JavaScript + SWC:

Need to install the following packages:
[email protected]
Ok to proceed? (y) y
✔ Project name: … code-viewer
✔ Select a framework: › React
✔ Select a variant: › JavaScript + SWC

Once the project is created, go into its directory and install the necessary files:

cd code-viewer
npm install

Next, install the Zigar plugin:

npm install --save-dev rollup-plugin-zigar

Create a sub-directory for Zig code:

mkdir zig

For demo purposes, we're going to use the Zig repository itself, going with the very first release to keep the file size small. Download https://github.com/ziglang/zig/archive/refs/tags/0.1.1.tar.gz and note the file path.

To get a feel for the standard library functions involved, we're going to first write a simple Zig program that lists the contents of this tarball. Create decompress.zig in the zig sub-directory and insert the following code:

const std = @import("std");

// Lists the contents of a gzipped tarball by layering a gzip
// decompressor over the file's reader and feeding the decompressed
// stream to std.tar.iterator().
pub fn main() !void {
    // NOTE(review): hardcoded path — the tutorial tells the reader to
    // replace it with their own download location.
    const path = "/home/cleong/Downloads/zig-0.1.1.tar.gz";
    var file = try std.fs.openFileAbsolute(path, .{});
    defer file.close();
    // gzip decompression layer over the raw file reader
    var decompressor = std.compress.gzip.decompressor(file.reader());
    // scratch buffers the tar iterator needs for entry and link names
    var file_name_buffer: [std.fs.max_path_bytes]u8 = undefined;
    var link_name_buffer: [std.fs.max_path_bytes]u8 = undefined;
    var iter = std.tar.iterator(decompressor.reader(), .{
        .file_name_buffer = &file_name_buffer,
        .link_name_buffer = &link_name_buffer,
    });
    // print the path of every entry in the archive
    while (try iter.next()) |f| {
        std.debug.print("{s}\n", .{f.name});
    }
}

Be sure to update the hardcoded file path with your own. Then run the program using the following command (or by clicking the "run" link in VS Code):

zig run decompress.zig

You should see a long list of files:

zig-0.1.1/
zig-0.1.1/.gitignore
zig-0.1.1/.travis.yml
zig-0.1.1/CMakeLists.txt
zig-0.1.1/LICENSE
zig-0.1.1/README.md
zig-0.1.1/build.zig
zig-0.1.1/c_headers/
zig-0.1.1/c_headers/__clang_cuda_builtin_vars.h
zig-0.1.1/c_headers/__clang_cuda_cmath.h
zig-0.1.1/c_headers/__clang_cuda_complex_builtins.h
zig-0.1.1/c_headers/__clang_cuda_intrinsics.h
zig-0.1.1/c_headers/__clang_cuda_math_forward_declares.h
zig-0.1.1/c_headers/__clang_cuda_runtime_wrapper.h
...

As you can see, extracting files from a tarball is fairly straightforward in Zig. We pass the file's reader to the gzip decompressor, then pass the decompressor's reader to std.tar.iterator(). The iterator then gives us each file.

Next, we're going to rework the code in preparation for using it in JavaScript. We want a function that accepts a std.io.AnyReader as an argument and returns an iterator of structs containing both file info and content:

const std = @import("std");

// Returns an iterator over the entries of the gzipped tarball supplied
// through `reader`. Construction of the decompressor and tar iterator
// is deferred until the first call to next().
fn extract(reader: std.io.AnyReader) !Iterator {
    return .{
        .reader = reader,
    };
}

// Pull-style iterator that lazily sets up the gzip decompressor and
// tar iterator on first use, then yields one fully-read File per call
// to next().
const Iterator = struct {
    const Decompressor = std.compress.gzip.Decompressor(std.io.AnyReader);
    const TarIterator = std.tar.Iterator(Decompressor.Reader);

    reader: std.io.AnyReader,
    // false until decompressor and tar_iter have been constructed
    started: bool = false,
    decompressor: Decompressor = undefined,
    // scratch space std.tar.Iterator uses for entry and link names
    file_name_buffer: [std.fs.max_path_bytes]u8 = undefined,
    link_name_buffer: [std.fs.max_path_bytes]u8 = undefined,
    tar_iter: TarIterator = undefined,

    // Returns the next archive entry with its content copied into
    // memory obtained from `allocator`, or null at end of archive.
    // The caller owns (and must free) name, link_name, and data.
    pub fn next(self: *@This(), allocator: std.mem.Allocator) !?File {
        if (!self.started) {
            // create decompressor
            self.decompressor = std.compress.gzip.decompressor(self.reader);
            // obtain the tar iterator
            self.tar_iter = std.tar.iterator(self.decompressor.reader(), .{
                .file_name_buffer = &self.file_name_buffer,
                .link_name_buffer = &self.link_name_buffer,
            });
            self.started = true;
        }
        // get next item
        const f = try self.tar_iter.next() orelse return null;
        const reader = f.reader();
        // read data
        // NOTE(review): f.size is u64; passing it to alloc() only
        // compiles where usize is 64-bit — the tutorial deliberately
        // leaves this in and fixes it with @intCast for WebAssembly.
        const data = try allocator.alloc(u8, f.size);
        errdefer allocator.free(data);
        const len = try reader.readAll(data);
        if (len != f.size) return error.SizeMismatch;
        const name = try allocator.dupe(u8, f.name);
        errdefer allocator.free(name);
        const link_name = try allocator.dupe(u8, f.link_name);
        errdefer allocator.free(link_name);
        return .{
            .name = name,
            .link_name = link_name,
            .size = f.size,
            .mode = f.mode,
            .kind = f.kind,
            .data = data,
        };
    }
};

// Metadata plus full content of one tar entry; the three slices are
// allocated from the allocator passed to Iterator.next() and are owned
// by the caller.
const File = struct {
    name: []const u8,
    link_name: []const u8,
    size: u64,
    mode: u32,
    kind: std.tar.FileKind,
    data: []const u8,
};

// Exercises extract() against a local copy of the tarball, freeing
// each entry's allocations as soon as its name has been printed.
pub fn main() !void {
    var gpa: std.heap.DebugAllocator(.{}) = .init;
    // report any allocations that were never freed
    defer _ = gpa.detectLeaks();
    const allocator = gpa.allocator();
    const path = "/home/cleong/Downloads/zig-0.1.1.tar.gz";
    var file = try std.fs.openFileAbsolute(path, .{});
    defer file.close();
    var iter = try extract(file.reader().any());
    while (try iter.next(allocator)) |f| {
        // next() transfers ownership of these three slices to us
        defer {
            allocator.free(f.name);
            allocator.free(f.link_name);
            allocator.free(f.data);
        }
        std.debug.print("{s}\n", .{f.name});
    }
}

Most of the work is done in the iterator's next function. The iterator will get hooked up to an async generator on the JavaScript side. The allocator that next() receives will allocate JavaScript memory.

Enabling use in JavaScript

Now we'll modify the code so it works in Node.js. First, move extract() along with the structs into the namespace thread_ns:

// Namespace whose public functions will be run on worker threads by
// zigar.thread.WorkQueue; the definitions from the previous listing
// move in here unchanged (elided below).
const thread_ns = struct {
    pub fn extract(reader: std.io.AnyReader) !Iterator {
        // ...
    }

    const Iterator = struct {
        // ...
    };

    const File = struct {
        // ...
    };
};

Be sure extract() is public. Then add code that sets up a work queue:

const std = @import("std");
const zigar = @import("zigar");

var gpa: std.heap.DebugAllocator(.{}) = .init;
// queue that dispatches calls into thread_ns onto worker threads
var work_queue: zigar.thread.WorkQueue(thread_ns) = .{};

// Spins up the work queue; must be called before extract().
pub fn startup() !void {
    try work_queue.init(.{ .allocator = gpa.allocator() });
}

// Shuts the queue down asynchronously; the promise resolves on the
// JavaScript side once deinitialization completes.
pub fn shutdown(promise: zigar.function.Promise(void)) void {
    work_queue.deinitAsync(promise);
}

// Thin wrapper exported to JavaScript: pushes thread_ns.extract onto
// the work queue and streams its results back through `generator`,
// which surfaces as an async generator in JavaScript.
pub fn extract(
    reader: std.io.AnyReader,
    generator: zigar.function.GeneratorOf(thread_ns.extract),
) !void {
    try work_queue.push(thread_ns.extract, .{reader}, generator);
}

In the src sub-directory, create test.js:

import { open } from 'fs/promises';
import { extract, startup, shutdown } from '../zig/decompress.zig';

// Start the Zig-side work queue before making any calls.
startup();
try {
  // Obtain a web ReadableStream for the local tarball and hand its
  // reader to the Zig extract() function.
  const file = await open('/home/cleong/Downloads/zig-0.1.1.tar.gz');
  const stream = file.readableWebStream();
  const reader = stream.getReader()
  // extract() yields one tar entry per iteration
  for await (const file of extract(reader)) {
    console.log(file.name.string);
  }
} finally {
  // release the worker threads even if extraction throws
  shutdown();
}

Install node-zigar before running the script:

npm install --save-dev node-zigar

Then run it:

node --loader=node-zigar --no-warnings src/test.js

After compilation completes, the following error will pop up immediately:

Error: Inefficient reader access. Each call is only reading 5.91 bytes. Please use std.io.BufferedReader.

As the message indicates, std.compress.gzip.Decompressor only reads a handful of bytes at a time from the given reader. This results in agonizingly low throughput, since the JavaScript event loop introduces substantial latency. The iterator needs to use std.io.BufferedReader to perform read operations in larger chunks:

        const Decompressor = std.compress.gzip.Decompressor(Buffer.Reader); // <- changed
        const TarIterator = std.tar.Iterator(Decompressor.Reader);
        const Buffer = std.io.BufferedReader(1024 * 16, std.io.AnyReader); // <- added

        reader: std.io.AnyReader,
        started: bool = false,
        decompressor: Decompressor = undefined,
        buffer: Buffer = undefined, // <- added
        file_name_buffer: [std.fs.max_path_bytes]u8 = undefined,
        link_name_buffer: [std.fs.max_path_bytes]u8 = undefined,
        tar_iter: TarIterator = undefined,

        pub fn next(self: *@This(), allocator: std.mem.Allocator) !?File {
            if (!self.started) {
                // create buffered reader
                self.buffer = .{ .unbuffered_reader = self.reader }; // <- added
                // create decompressor
                self.decompressor = std.compress.gzip.decompressor(self.buffer.reader()); // <- changed

After that the script will run with no issue. Now we can test whether it works correctly with a stream from the Fetch API:

import { extract, startup, shutdown } from '../zig/decompress.zig';

// Same test as before, but reading the tarball straight from a
// fetch() response body instead of a local file.
startup();
try {
  const response = await fetch('https://github.com/ziglang/zig/archive/refs/tags/0.1.1.tar.gz');
  const reader = response.body.getReader()
  for await (const file of extract(reader)) {
    console.log(file.name.string);
  }
} finally {
  // release the worker threads even if extraction throws
  shutdown();
}

And it should. The file paths should fly by nearly as quickly as before if you have a decent Internet connection.

Running in web browser

After all that preliminary work, we've reached the moment of truth. Will our code work in the browser?

We need to first set up rollup-plugin-zigar. Open vite.config.js and add the plugin:

import react from '@vitejs/plugin-react-swc';
import zigar from 'rollup-plugin-zigar';
import { defineConfig } from 'vite';

// https://vitejs.dev/config/
export default defineConfig({
  // topLevelAwait is disabled for the sake of older browsers;
  // multithreaded builds require the isolation headers set below
  plugins: [react(), zigar({ topLevelAwait: false, multithreaded: true, optimize: 'ReleaseSmall' })],
  server: {
    host: true,
    // these two headers make the page cross-origin isolated, a
    // prerequisite for WebAssembly threads
    headers: {
      'Cross-Origin-Opener-Policy': 'same-origin',
      'Cross-Origin-Embedder-Policy': 'require-corp',
    }
  },
})

We need to configure Vite's dev server to add two HTTP headers to responses to make our app cross origin isolated. This is a prerequisite of using WebAssembly threads.

After changing the configuration, open src/App.jsx and add test.js as an import:

import './App.css';
import './test.js';

We can then start Vite in dev mode:

npm run dev

We'll run straight into a Zig compilation error:

/home/cleong/zigar/rollup-plugin-zigar/demos/vite/code-viewer/zig/decompress.zig:59:51: 
error: expected type 'usize', found 'u64'

We had neglected to cast u64 to usize. On a 64-bit OS usize is 64-bit so there was no problem. WebAssembly is 32-bit, so the compiler balked. We can fix this easily enough:

            // read data
            const correct_len: usize = @intCast(f.size);
            const data = try allocator.alloc(u8, correct_len);
            errdefer allocator.free(data);
            const len = try reader.readAll(data);
            if (len != correct_len) return error.SizeMismatch;

After restarting Vite, our code will compile correctly, but immediately we encounter a more serious issue:

Dev console

GitHub's servers don't allow cross-origin access, so our web app cannot retrieve the tarball directly. Luckily, CORS proxy services such as https://corsproxy.io/ exist. As soon as we prepend the proxy's URL to ours:

  const response = await fetch('https://corsproxy.io/?url=https://github.com/ziglang/zig/archive/refs/tags/0.1.1.tar.gz');

Bingo! We have file paths racing by in the dev console:

Dev console

Now it's just a matter of creating a user interface. First open src/main.jsx and remove the <StrictMode> tags:

import { createRoot } from 'react-dom/client'
import App from './App.jsx'
import './index.css'

// <StrictMode> removed so React does not initialize the app (and run
// its effects) twice in dev mode
createRoot(document.getElementById('root')).render(
  <App />
)

This stops React from initializing the app twice in dev mode. After that, install a file tree component and another for syntax highlighting:

npm install @sinm/react-file-tree react-syntax-highlighter

Since this isn't a React tutorial, I'm not going to go into the details of how the code works. I'll simply provide the updated files, starting with App.jsx:

import { FileTree, utils } from '@sinm/react-file-tree';
import '@sinm/react-file-tree/icons.css';
import FileItemWithFileIcon from '@sinm/react-file-tree/lib/FileItemWithFileIcon';
import '@sinm/react-file-tree/styles.css';
import { startTransition, useCallback, useEffect, useState } from 'react';
import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter';
import { dark } from 'react-syntax-highlighter/dist/esm/styles/prism';
import { extract, shutdown, startup } from '../zig/decompress.zig';
import './App.css';

// decodes tar entry bytes into a string for the highlighter
const decoder = new TextDecoder;
// maps file extensions to syntax-highlighter language names;
// extensions not listed here are passed through unchanged
const ext2lang = {
  h: 'c',
  c: 'c',
  hpp: 'cpp',
  cpp: 'cpp',
  js: 'javascript',
};

// Two-pane viewer: a file tree built incrementally from the tarball
// stream on the left, a syntax-highlighted view of the selected file
// on the right.
function App() {
  // tree: file-tree model; codeString/language: currently shown file
  const [ tree, setTree ] = useState(null);
  const [ codeString, setCodeString ] = useState('');
  const [ language, setLanguage ] = useState('');
  const treeProps = {
    tree,
    itemRenderer: (treeNode) => <FileItemWithFileIcon treeNode={treeNode} />,
    // clicking a directory toggles its expansion; clicking a file
    // decodes its bytes and shows them in the highlighter pane
    onItemClick: useCallback(({ uri, type, data, expanded }) => {
      startTransition(() => {
        if (type === 'directory') {
          setTree(tree => utils.assignTreeNode(tree, uri, { expanded: !expanded }));
        } else {
          const code = decoder.decode(data);
          // take the extension only if the last dot comes after the
          // last slash (i.e. it belongs to the file name itself)
          const slashIndex = uri.lastIndexOf('/');
          const dotIndex = uri.lastIndexOf('.');
          const ext = (dotIndex > slashIndex) ? uri.slice(dotIndex + 1) : '';
          setCodeString(code);
          setLanguage(ext2lang[ext] ?? ext);
        }
      });
    }, []),
  };
  const highlightProps = {
    language,
    showLineNumbers: true,
    style: dark,
    // null out the theme's box styling so the pane fills cleanly
    customStyle: {
      backgroundColor: null,
      border: null,
      boxShadow: null,
      padding: null,
      margin: 0,
    },
  };
  useEffect(() => {
    let unmounted = false;    
    // Streams the tarball through the Zig extract() generator and
    // grafts each entry onto the tree as it arrives.
    async function load() {
      startup();
      const response = await fetch('https://corsproxy.io/?url=https://github.com/ziglang/zig/archive/refs/tags/0.1.1.tar.gz');
      const reader = response.body.getReader()
      try {
        // the extra await handles the case where WebAssembly
        // compilation has not finished yet (no top-level await)
        for await (const file of await extract(reader)) {
          if (unmounted) break;
          startTransition(() => {
            setTree((tree) => {
              // strip the trailing slash that directory entries carry
              const uri = file.name.string.replace(/\/$/, '');
              const slashIndex = uri.lastIndexOf('/');
              if (slashIndex === -1) {
                // top-level entry becomes the (expanded) tree root
                return { uri, expanded: true };
              } else {
                const parentUri = uri.slice(0, slashIndex);
                const node = (file.kind == 'directory') 
                ? { uri, type: 'directory', expanded: false }
                : { uri, type: 'file', data: file.data.typedArray };
                return utils.appendTreeNode(tree, parentUri, node);
              }
            });
          });
        }
      } finally {
        await shutdown();
      }
    }
    load();
    // signal the in-flight loop to stop when the component unmounts
    return () => unmounted = true;
  }, []);
  return (
    <>
      <div id="left-pane">
        <FileTree {...treeProps} />
      </div>
      <div id="right-pane">
        <SyntaxHighlighter {...highlightProps}>
          {codeString}
        </SyntaxHighlighter>
      </div>
    </>
  )
}

export default App

Then App.css:

/* Full-height two-pane layout: file tree left, code right. */
#root {
  margin: 0 auto;
  display: flex;
  flex-direction: row;
  height: 100vh;
}

/* File tree pane: 16em basis, may grow but never shrink. */
#left-pane {
  flex: 1 0 16em;
  display: flex;
  border-right: 1px solid #333333;
  padding-left: 0.5em;
}

/* Tree item labels are clickable. */
#left-pane span {
  cursor: pointer;
}

/* Code pane takes the remaining width and scrolls its content. */
#right-pane {
  flex: 10 1 auto;
  display: flex;
  overflow: auto;
}

/* Let the highlighter's <pre> fill the pane. */
#right-pane pre {
  flex: 1 1 100%;
}

And finally index.css:

/* Base typography and dark color scheme for the whole app. */
:root {
  font-family: system-ui, Avenir, Helvetica, Arial, sans-serif;
  line-height: 1.5;
  font-weight: 400;

  color-scheme: light dark;
  color: rgba(255, 255, 255, 0.87);
  background-color: #242424;

  font-synthesis: none;
  text-rendering: optimizeLegibility;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

/* Remove default margin so the panes reach the viewport edges. */
body {
  margin: 0;
  min-width: 320px;
  min-height: 100vh;
}

One thing worth mentioning is why an await has been added in front of the call to extract(). This is necessary because we've disabled the use of top-level await for the sake of older browsers. As a result, it's possible for WebAssembly compilation to be unfinished by the time of the call. In that case extract() would return a promise (instead of an async generator).

Here's how our app looks with the UI in place:

User interface

Source code

You can find the complete source code for this example here.

You can see the code in action here. As corsproxy.io's free tier only works at localhost, this live demo does not actually download the tarball from GitHub, relying instead on a copy hosted at Cloudflare.

⚠️ **GitHub.com Fallback** ⚠️