Writing a Type-Safe Linux Perf Interface in Zig

I am currently building a hobby project: pyk/bench, a microbenchmarking library for Zig. My goal is to make it fast and accurate. To measure performance properly, looking at wall clock time is not enough. I need to know what the CPU is actually doing.

I want to measure CPU cycles, instruction counts, and cache misses. On Linux, the kernel provides a system call for this named perf_event_open. It is very powerful, but the API is raw and not easy to use safely.

Understanding perf_event_open

The perf_event_open system call creates a file descriptor that allows measuring performance information. You fill out a perf_event_attr struct with the config you want, such as PERF_COUNT_HW_CPU_CYCLES or PERF_COUNT_HW_INSTRUCTIONS, and the kernel gives you back a file descriptor.

You can read from this file descriptor to get the counts. The format of the data you read depends on how you opened it.

You can also group events. This is important because it lets you measure multiple things at once with a single read call. One event acts as the “group leader” and others are “siblings”. When you read from the leader, you get a binary layout containing values for all events in the group.

The layout looks roughly like this in C:

C
struct read_format {
    u64 nr;            /* The number of events */
    u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
    u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
    struct {
        u64 value;     /* The value of the event */
        u64 id;        /* if PERF_FORMAT_ID */
    } values[nr];
};

This is dynamic. The size of the struct changes based on how many events you have. In Zig I want something static and type-safe.

The Non-Type-Safe Approach

My first attempt was brittle. I hardcoded a struct with the fields I thought I would need.

Zig
// References: https://man7.org/linux/man-pages/man2/perf_event_open.2.html

const std = @import("std");
const builtin = @import("builtin");
const linux = std.os.linux;
const posix = std.posix; // TODO: remove this, ref https://github.com/ziglang/zig/issues/6600

const Perf = @This();
// PERF_EVENT_IOC_ID ioctl request: _IOR('$', 7, u64). Asks the kernel for
// the ID it assigned to a perf event fd (see `man 2 perf_event_open`).
const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64);

/// File descriptor of the group leader (CPU cycles). -1 when not open.
leader_fd: posix.fd_t = -1,
/// File descriptors of the sibling events attached to the leader
/// (instructions, cache misses). -1 when not open.
sibling_fds: [2]posix.fd_t = .{ -1, -1 },

/// IDs assigned by the kernel to identify events in the read buffer.
/// Indices: 0=Cycles, 1=Instructions, 2=CacheMisses
ids: [3]u64 = .{ 0, 0, 0 },

/// Snapshot of the hardware counter values returned by `read()`.
pub const Measurements = struct {
    cycles: u64,
    instructions: u64,
    cache_misses: u64,
};

/// Opens the three hardware counters as one perf event group: CPU cycles
/// (group leader) plus instructions and cache misses as siblings.
/// Caller owns the result and must call `deinit` to close the fds.
pub fn init() !Perf {
    var self = Perf{};
    // If a later openEvent/getId fails, close whatever we already opened.
    // The fields default to -1, so deinit is safe on a partial init.
    errdefer self.deinit();

    // CPU Cycles (Group Leader)
    self.leader_fd = try openEvent(.cpu_cycles, -1);
    self.ids[0] = try getId(self.leader_fd);

    {
        const fd = try openEvent(.instructions, self.leader_fd);
        self.ids[1] = try getId(fd);
        self.sibling_fds[0] = fd;
    }

    {
        const fd = try openEvent(.cache_misses, self.leader_fd);
        self.ids[2] = try getId(fd);
        self.sibling_fds[1] = fd;
    }

    return self;
}

/// Closes all open file descriptors and marks the group as closed.
/// Safe to call more than once; already-closed slots are skipped.
pub fn deinit(self: *Perf) void {
    // Leader first, then every sibling counter.
    if (self.leader_fd != -1) {
        _ = linux.close(self.leader_fd);
        self.leader_fd = -1;
    }
    for (&self.sibling_fds) |*slot| {
        if (slot.* != -1) _ = linux.close(slot.*);
        slot.* = -1;
    }
}

/// Resets the group's counters to zero and starts counting.
/// No-op if the group is closed.
pub fn capture(self: *Perf) !void {
    if (self.leader_fd == -1) return;
    // linux.ioctl is a raw syscall wrapper; its result must be decoded
    // with linux.errno, not std.c.errno (which is for libc returns).
    // Return errors instead of panicking so callers can recover.
    const reset = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.RESET, 0);
    if (linux.errno(reset) != .SUCCESS) return error.IoctlFailed;
    const enable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.ENABLE, 0);
    if (linux.errno(enable) != .SUCCESS) return error.IoctlFailed;
}

/// Stops counting. Counter values are preserved and can still be read.
/// No-op if the group is closed.
pub fn stop(self: *Perf) !void {
    if (self.leader_fd == -1) return;
    // Decode the raw syscall result with linux.errno (std.c.errno is for
    // libc returns) and report failure as an error rather than panicking.
    const disable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.DISABLE, 0);
    if (linux.errno(disable) != .SUCCESS) return error.IoctlFailed;
}

/// Reads the counter values from the group leader.
/// Returns zeroed measurements if the group is closed or the counters
/// never ran. Values are scaled when the kernel multiplexed the PMU.
pub fn read(self: *Perf) !Measurements {
    var m = Measurements{
        .cycles = 0,
        .instructions = 0,
        .cache_misses = 0,
    };
    if (self.leader_fd == -1) return m;

    // Format: PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_GROUP
    // Layout: nr, time_enabled, time_running, [value, id], [value, id], ...
    // With 3 events that is 3 + (2 * 3) = 9 u64s; 16 leaves headroom.
    var buf: [16]u64 = undefined;

    // Bound parsing by what the kernel actually wrote, not by the buffer
    // capacity: a short read must not expose undefined buffer memory.
    const bytes_read = try posix.read(self.leader_fd, std.mem.sliceAsBytes(&buf));
    const words = bytes_read / @sizeOf(u64);
    if (words < 3) return error.ShortRead;

    const nr = buf[0];
    const time_enabled = buf[1];
    const time_running = buf[2];

    // time_running == 0 means the counters never ran; scaling below would
    // divide by zero, so report zeros instead.
    if (time_running == 0) return m;

    var i: usize = 0;
    while (i < nr) : (i += 1) {
        const base_idx = 3 + (i * 2);
        if (base_idx + 1 >= words) break;

        var val = buf[base_idx];
        const id = buf[base_idx + 1];

        // The kernel multiplexes counters when the PMU is oversubscribed;
        // estimate the full-period count via time_enabled / time_running.
        if (time_running < time_enabled) {
            val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * (@as(f64, @floatFromInt(time_enabled)) / @as(f64, @floatFromInt(time_running)))));
        }

        // Map the kernel-assigned ID back to the slot recorded in init.
        if (id == self.ids[0]) m.cycles = val;
        if (id == self.ids[1]) m.instructions = val;
        if (id == self.ids[2]) m.cache_misses = val;
    }

    return m;
}

const Event = enum { cpu_cycles, instructions, cache_misses };

/// Opens a perf fd for one hardware counter. Pass group_fd = -1 to create
/// a group leader; pass the leader's fd to attach a sibling event.
fn openEvent(event: Event, group_fd: posix.fd_t) !posix.fd_t {
    var attr = std.mem.zeroes(linux.perf_event_attr);
    attr.type = linux.PERF.TYPE.HARDWARE;
    attr.config = switch (event) {
        .cpu_cycles => @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES),
        .instructions => @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS),
        .cache_misses => @intFromEnum(linux.PERF.COUNT.HW.CACHE_MISSES),
    };

    // TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING | ID | GROUP (bits 0..3)
    attr.read_format = 0b1111;

    attr.flags.disabled = (group_fd == -1); // Only leader starts disabled
    attr.flags.inherit = true;
    attr.flags.exclude_kernel = true;
    attr.flags.exclude_hv = true;

    // pid=0 (this process), cpu=-1 (any CPU), flags=0
    return try posix.perf_event_open(&attr, 0, -1, group_fd, 0);
}

/// Asks the kernel for the unique ID of the event behind `fd` via the
/// PERF_EVENT_IOC_ID ioctl. The ID is used to match entries in the
/// group read buffer back to their events.
fn getId(fd: i32) !u64 {
    var id: u64 = 0;
    const rc = linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id));
    if (rc != 0) return error.IoctlFailed;
    return id;
}

This works but it is dangerous. The Measurements struct is hardcoded.

Zig
pub const Measurements = struct {
    cycles: u64,
    instructions: u64,
    cache_misses: u64,
};

If I change the initialization logic to add “branch misses”, I have to remember to update Measurements, update the ids array size and update the read function manually. The compiler cannot help me here.

If I only initialize two events but still read ids[2], I get a stale or zero ID, and the results are silently mapped to the wrong counters.

The Type-Safe Approach using comptime

Zig allows running code at compile time to generate types. I can use this to generate a struct that exactly matches the events I request.

I defined an Event enum for the things I want to measure.

Zig
pub const Event = enum {
    cpu_cycles,
    instructions,
    cache_misses,
    branch_misses,
    bus_cycles,

    pub fn toConfig(self: Event) u64 {
        // map to kernel constants
    }
};

Then I wrote a function that takes a slice of these events and returns a new type.

Zig
pub fn GroupReadOutputType(comptime events: []const Event) type {
    var field_names: [events.len][]const u8 = undefined;
    var field_types: [events.len]type = undefined;
    var field_attrs: [events.len]Type.StructField.Attributes = undefined;

    for (events, 0..) |event, index| {
        field_names[index] = @tagName(event);
        field_types[index] = u64;
        field_attrs[index] = .{
            .@"comptime" = false,
            .@"align" = @alignOf(u64),
            .default_value_ptr = null,
        };
    }
    return @Struct(
        .auto,
        null,
        &field_names,
        &field_types,
        &field_attrs,
    );
}

This function creates a struct with fields named after the enum tags. If I pass &.{ .cpu_cycles, .instructions }, it generates:

Zig
struct {
    cpu_cycles: u64,
    instructions: u64,
}

Now I can create a generic Group type that uses this.

Zig
pub fn Group(comptime events: []const Event) type {
    const Output = GroupReadOutputType(events);

    return struct {
        const Self = @This();
        event_ids: [events.len]u64 = undefined,

        // ... init logic ...

        pub fn read(self: *Self) Error!Output {
            var output: Output = std.mem.zeroes(Output);

            // ... read binary data from kernel ...

            for (data.values) |item| {
                // Map the kernel ID back to our event tags
                inline for (events, 0..) |tag, i| {
                    if (item.id == self.event_ids[i]) {
                        @field(output, @tagName(tag)) = item.value;
                    }
                }
            }
            return output;
        }
    };
}

The usage is cleaner and safer.

Zig
const events = [_]perf.Event{ .cpu_cycles, .instructions };
const perf_group = perf.Group(&events);
var group = try perf_group.init();

// ... run code ...

const m = try group.read();
std.debug.print("Cycles: {d}\n", .{m.cpu_cycles});
// std.debug.print("{d}", .{m.cache_misses}); // Compile Error! field does not exist

If I try to access a field I did not request, the compiler stops me.

Here is what the full version looks like:

src/perf.zig
const std = @import("std");
const linux = std.os.linux;
const Type = std.builtin.Type;

// Bits for perf_event_attr.read_format (see `man 2 perf_event_open`,
// section "Reading results"). Combined, they make a read from the group
// leader return: nr, time_enabled, time_running, then {value, id} pairs.
const PERF_FORMAT_TOTAL_TIME_ENABLED = 1 << 0;
const PERF_FORMAT_TOTAL_TIME_RUNNING = 1 << 1;
const PERF_FORMAT_ID = 1 << 2;
const PERF_FORMAT_GROUP = 1 << 3;

// Various ioctls act on perf_event_open() file descriptors:
// PERF_EVENT_IOC_ID is built here as _IOR('$', 7, u64); the others are
// re-exported from std.os.linux for readability at the call sites.
const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64);
const PERF_EVENT_IOC_RESET = linux.PERF.EVENT_IOC.RESET;
const PERF_EVENT_IOC_ENABLE = linux.PERF.EVENT_IOC.ENABLE;
const PERF_EVENT_IOC_DISABLE = linux.PERF.EVENT_IOC.DISABLE;

/// Hardware events supported by the kernel for performance monitoring.
/// Each variant maps to a `perf_event_attr.config` value.
pub const Event = enum {
    cpu_cycles,
    instructions,
    cache_misses,
    branch_misses,
    bus_cycles,

    /// Returns the kernel configuration integer for this event, as
    /// required by the `perf_event_open` syscall.
    pub fn toConfig(self: Event) u64 {
        const hw = linux.PERF.COUNT.HW;
        const counter = switch (self) {
            .cpu_cycles => hw.CPU_CYCLES,
            .instructions => hw.INSTRUCTIONS,
            .cache_misses => hw.CACHE_MISSES,
            .branch_misses => hw.BRANCH_MISSES,
            .bus_cycles => hw.BUS_CYCLES,
        };
        return @intFromEnum(counter);
    }
};

/// Builds, at compile time, a plain struct with one `u64` field per
/// requested event, named after the event's enum tag. For example
/// `&.{ .cpu_cycles, .instructions }` yields
/// `struct { cpu_cycles: u64, instructions: u64 }`.
pub fn GroupReadOutputType(comptime events: []const Event) type {
    var field_names: [events.len][]const u8 = undefined;
    var field_types: [events.len]type = undefined;
    var field_attrs: [events.len]Type.StructField.Attributes = undefined;
    for (events, 0..) |event, index| {
        // Field name mirrors the enum tag; every counter is a raw u64
        // with natural alignment and no default value.
        field_names[index] = @tagName(event);
        field_types[index] = u64;
        field_attrs[index] = .{
            .@"comptime" = false,
            .@"align" = @alignOf(u64),
            .default_value_ptr = null,
        };
    }
    // Reify the collected names/types/attributes into a new auto-layout
    // struct type. NOTE(review): @Struct is a recent builtin — confirm it
    // matches the Zig version this project targets.
    return @Struct(
        .auto,
        null,
        &field_names,
        &field_types,
        &field_attrs,
    );
}

/// A type-safe wrapper for the Linux `perf_event_open` system call,
/// specifically configured for event grouping (`PERF_FORMAT_GROUP`).
///
/// `Group` leverages Zig's `comptime` features to generate a custom
/// `ReadOutputType` result type that strictly matches the requested `events`.
/// It manages the complexity of creating a group leader, attaching sibling
/// events, and handling the binary layout of the kernel's read buffer.
///
/// Notes:
/// * The `read()` method returns a struct with named fields corresponding
///   exactly to the input events (e.g. `.cpu_cycles`).
/// * The `read()` method automatically detects if the CPU was oversubscribed
///   and scales the counter values based on `time_enabled` and `time_running`.
///
/// References:
/// * man 2 perf_event_open
/// * man 1 perf-list
pub fn Group(comptime events: []const Event) type {
    if (events.len == 0) @compileError("perf.Group requires at least 1 event");

    const Error = error{
        /// Failed to open group via perf_event_open
        OpenGroupFailed,
        /// Failed to retrieve the ID of the event via IOCTL
        GetIdFailed,
        /// Failed to reset counters via IOCTL
        ResetGroupFailed,
        /// Failed to enable counters via IOCTL
        EnableGroupFailed,
        /// Failed to disable counters via IOCTL
        DisableGroupFailed,
        /// Failed to read data from the file descriptor
        ReadGroupFailed,
        /// Group already deinitialized
        BadGroup,
    };

    const Output = GroupReadOutputType(events);

    // Matches the binary layout of the buffer read from the group leader fd.
    // See `man perf_event_open` section "Reading results".
    // Corresponds to `struct read_format` when using:
    // PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
    // PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID
    const ReadFormatGroup = extern struct {
        /// The number of events in this group.
        nr: u64,
        /// Total time the event group was enabled.
        time_enabled: u64,
        /// Total time the event group was actually running.
        time_running: u64,
        /// Array of values matching the `nr` of events.
        values: [events.len]extern struct {
            value: u64,
            id: u64,
        },
    };

    return struct {
        const Self = @This();

        event_fds: [events.len]linux.fd_t = undefined,
        event_ids: [events.len]u64 = undefined,

        /// Initializes the performance monitoring group.
        ///
        /// This opens a file descriptor for every event in the `events` list.
        /// The first event becomes the group leader. All subsequent events
        /// are created as siblings pinned to the leader.
        ///
        /// The counters start in a disabled state. You must call `enable()`
        /// to begin counting.
        ///
        /// **Note:** The caller owns the returned group and must call `deinit`
        /// to close the file descriptors.
        pub fn init() Error!Self {
            var self = Self{};
            @memset(&self.event_fds, -1);
            // If a later open/ioctl fails, close the fds we already opened
            // so a partially-constructed group does not leak descriptors.
            // Safe because every fd slot was just set to -1.
            errdefer self.deinit();

            // Leader
            const event_config = events[0].toConfig();
            self.event_fds[0] = try perf_open_group(-1, event_config);
            self.event_ids[0] = try ioctl_get_id(self.event_fds[0]);
            const group_fd = self.event_fds[0];

            // Siblings, pinned to the leader. The slice is empty when there
            // is only one event, so no extra guard is needed.
            for (events[1..], 1..) |event, i| {
                const config = event.toConfig();
                self.event_fds[i] = try perf_open_group(group_fd, config);
                self.event_ids[i] = try ioctl_get_id(self.event_fds[i]);
            }
            return self;
        }

        /// Closes all file descriptors associated with this event group.
        /// This invalidates the group object. Safe to call more than once.
        pub fn deinit(self: *Self) void {
            for (self.event_fds, 0..) |event_fd, index| {
                if (event_fd != -1) {
                    _ = linux.close(event_fd);
                }
                self.event_fds[index] = -1;
                self.event_ids[index] = 0;
            }
        }

        /// Resets and enables the event group. Counting begins immediately.
        pub fn enable(self: *Self) Error!void {
            const group_fd = self.event_fds[0];
            if (group_fd == -1) return error.BadGroup;
            try ioctl_reset_group(group_fd);
            try ioctl_enable_group(group_fd);
        }

        /// Disables the event group. Counting stops immediately.
        pub fn disable(self: *Self) Error!void {
            const group_fd = self.event_fds[0];
            if (group_fd == -1) return error.BadGroup;
            try ioctl_disable_group(group_fd);
        }

        /// Reads the current values from the kernel and maps them to the
        /// type-safe output struct.
        ///
        /// This performs the following operations:
        /// 1. Reads the `read_format` binary struct from the leader FD.
        /// 2. Checks `time_enabled` and `time_running` to detect if the CPU
        ///    was oversubscribed.
        /// 3. If multiplexing occurred (time_running < time_enabled), scales
        ///    the raw values: `val = raw_val * (time_enabled / time_running)`
        /// 4. Maps the kernel's event IDs back to the field names of the output
        ///    struct.
        pub fn read(self: *Self) Error!Output {
            var output: Output = std.mem.zeroes(Output);
            var data: ReadFormatGroup = undefined;

            const rc = linux.read(self.event_fds[0], @ptrCast(&data), @sizeOf(ReadFormatGroup));
            if (linux.errno(rc) != .SUCCESS) return error.ReadGroupFailed;
            // A short read would leave part of `data` undefined; treat it
            // as a failed read rather than parsing garbage.
            if (rc != @sizeOf(ReadFormatGroup)) return error.ReadGroupFailed;

            // If time_running is 0, we can't scale, so return zeros.
            if (data.time_running == 0) return output;

            // Multiplexing scaling: scaled_value = value * (time_enabled / time_running)
            const scale_needed = data.time_running < data.time_enabled;
            const scale_factor = if (scale_needed)
                @as(f64, @floatFromInt(data.time_enabled)) / @as(f64, @floatFromInt(data.time_running))
            else
                1.0;

            for (data.values) |item| {
                var val = item.value;

                if (scale_needed) {
                    val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * scale_factor));
                }

                // Map the kernel ID back to our event tags
                inline for (events, 0..) |tag, i| {
                    if (item.id == self.event_ids[i]) {
                        @field(output, @tagName(tag)) = val;
                    }
                }
            }

            return output;
        }

        ///////////////////////////////////////////////////////////////////////////////
        // perf & ioctl calls

        // Open new file descriptor for the specific event.
        // group_fd == -1 creates a group leader; otherwise attaches a sibling.
        fn perf_open_group(group_fd: linux.fd_t, config: u64) Error!linux.fd_t {
            var attr = std.mem.zeroes(linux.perf_event_attr);
            attr.type = linux.PERF.TYPE.HARDWARE;
            attr.config = config;

            // Enable grouping and ID tracking
            attr.read_format = PERF_FORMAT_GROUP |
                PERF_FORMAT_TOTAL_TIME_ENABLED |
                PERF_FORMAT_TOTAL_TIME_RUNNING |
                PERF_FORMAT_ID;

            attr.flags.disabled = (group_fd == -1); // Only leader starts disabled
            attr.flags.inherit = true;
            attr.flags.exclude_kernel = true;
            attr.flags.exclude_hv = true;

            // ref: `man 2 perf_event_open`
            // pid=0 (current process), cpu=-1 (any cpu), flags=0
            const pid = 0;
            const cpu = -1;
            const flags = 0;

            const rc = linux.perf_event_open(&attr, pid, cpu, group_fd, flags);
            if (linux.errno(rc) != .SUCCESS) return error.OpenGroupFailed;
            return @intCast(rc);
        }

        // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ID`
        fn ioctl_get_id(fd: linux.fd_t) Error!u64 {
            var id: u64 = 0;
            const rc = linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id));
            if (linux.errno(rc) != .SUCCESS) return error.GetIdFailed;
            return id;
        }

        // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_RESET`
        fn ioctl_reset_group(fd: linux.fd_t) Error!void {
            const rc = linux.ioctl(fd, PERF_EVENT_IOC_RESET, 0);
            if (linux.errno(rc) != .SUCCESS) return error.ResetGroupFailed;
        }

        // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ENABLE`
        fn ioctl_enable_group(fd: linux.fd_t) Error!void {
            const rc = linux.ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            if (linux.errno(rc) != .SUCCESS) return error.EnableGroupFailed;
        }

        // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_DISABLE`
        fn ioctl_disable_group(fd: linux.fd_t) Error!void {
            const rc = linux.ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            if (linux.errno(rc) != .SUCCESS) return error.DisableGroupFailed;
        }
    };
}

A Super Duper Tiny Contribution

To implement this I needed to get the ID of the event from the file descriptor. The kernel documentation says to use ioctl with PERF_EVENT_IOC_ID.

I checked std.os.linux in the Zig standard library and it was missing.

So I opened a pull request to add it. It is just a one-line change.

lib/std/os/linux.zig
pub const ID = 2148017159;

The value is derived from:

Zig
pub const PERF_EVENT_IOC_ID = IOCTL.IOR('$', 7, u64);

You can see the PR here: https://codeberg.org/ziglang/zig/pulls/30162. I am not sure if it will be accepted but it felt good to fix a missing piece in the tool I use.