Merge pull request #2287 from ghostty-org/minidump

Minidump Parser
2025-07-22 11:46:11 +03:00 · 2024-09-22 14:12:12 -07:00
parent 826ba6e46e 9178d1cf38
commit 599934d5de
7 changed files with 456 additions and 0 deletions
--- a/src/crash/main.zig
+++ b/src/crash/main.zig
@ -5,6 +5,7 @@
 const dir = @import("dir.zig");
 const sentry_envelope = @import("sentry_envelope.zig");

+pub const minidump = @import("minidump.zig");
 pub const sentry = @import("sentry.zig");
 pub const Envelope = sentry_envelope.Envelope;
 pub const defaultDir = dir.defaultDir;
--- a/src/crash/minidump.zig
+++ b/src/crash/minidump.zig
@ -0,0 +1,7 @@
+pub const reader = @import("minidump/reader.zig");
+pub const stream = @import("minidump/stream.zig");
+pub const Reader = reader.Reader;
+
+// Reference every declaration in this file so that the tests nested in the
+// imported minidump modules are analyzed and run.
+test {
+    const testing = @import("std").testing;
+    testing.refAllDecls(@This());
+}
--- a/src/crash/minidump/external.zig
+++ b/src/crash/minidump/external.zig
@ -0,0 +1,59 @@
+//! This file contains the external structs and constants for the minidump
+//! format. Most are from the Microsoft documentation on the minidump format:
+//! https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/
+//!
+//! Wherever possible, we also compare our definitions to other projects
+//! such as rust-minidump, libmdmp, breakpad, etc. to ensure we're doing
+//! the right thing.
+
+/// "MDMP" in little-endian.
+pub const signature = 0x504D444D;
+
+/// The version of the minidump format.
+pub const version = 0xA793;
+
+/// The header found at the very start of a minidump file.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_header
+pub const Header = extern struct {
+    /// Compared against `signature` by the reader to validate the file and
+    /// to detect its byte order.
+    signature: u32,
+    /// Only the low word is compared against `version` by the reader.
+    version: packed struct(u32) { low: u16, high: u16 },
+    /// Number of `Directory` entries in the stream directory.
+    stream_count: u32,
+    /// Offset from the start of the file to the first `Directory` entry.
+    stream_directory_rva: u32,
+    checksum: u32,
+    time_date_stamp: u32,
+    flags: u64,
+};
+
+/// A single entry in the stream directory: the stream's type and where its
+/// payload lives in the file.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory
+pub const Directory = extern struct {
+    /// Numeric stream type. The thread list reader asserts 0x3 for a
+    /// thread list stream.
+    stream_type: u32,
+    /// Position and size of the stream payload.
+    location: LocationDescriptor,
+};
+
+/// Describes a span of bytes within the minidump file.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor
+pub const LocationDescriptor = extern struct {
+    /// Number of bytes in the span. The reader uses this as the byte limit
+    /// when reading the span.
+    data_size: u32,
+    /// Absolute position the reader seeks to in the source to reach the
+    /// first byte of the span.
+    rva: u32,
+};
+
+/// Describes a captured range of process memory and where its bytes are
+/// stored in the minidump file.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_memory_descriptor
+pub const MemoryDescriptor = extern struct {
+    // NOTE(review): presumably the address of the range in the crashed
+    // process, per the linked documentation; nothing in this module reads it.
+    start_of_memory_range: u64,
+    /// Location of the captured bytes within the file.
+    memory: LocationDescriptor,
+};
+
+/// The header of a thread list stream as declared by the Microsoft API.
+///
+/// The thread list reader does not read this struct from the file directly:
+/// it reads the count as a bare u32 and computes the position of the first
+/// thread itself, because the stream may or may not contain padding between
+/// the count and the first thread.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_thread_list
+pub const ThreadList = extern struct {
+    number_of_threads: u32,
+    // Declared with a single element to mirror the upstream definition; the
+    // real number of entries is `number_of_threads`.
+    threads: [1]Thread,
+};
+
+/// A single entry in a thread list stream.
+///
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_thread
+pub const Thread = extern struct {
+    thread_id: u32,
+    suspend_count: u32,
+    priority_class: u32,
+    priority: u32,
+    teb: u64,
+    /// The thread's stack; `stack.memory` locates the captured stack bytes
+    /// within the file.
+    stack: MemoryDescriptor,
+    /// Location of the thread's context record within the file. Its layout
+    /// is not decoded by this module.
+    thread_context: LocationDescriptor,
+};
--- a/src/crash/minidump/reader.zig
+++ b/src/crash/minidump/reader.zig
@ -0,0 +1,242 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const external = @import("external.zig");
+const stream = @import("stream.zig");
+const EncodedStream = stream.EncodedStream;
+
+const log = std.log.scoped(.minidump_reader);
+
+/// Possible minidump-specific errors that can occur when reading a minidump.
+/// This isn't the full error set since IO errors can also occur depending
+/// on the Source type.
+pub const ReadError = error{
+    /// The header signature did not match in either byte order.
+    InvalidHeader,
+    /// The low word of the header version is not the expected minidump
+    /// version.
+    InvalidVersion,
+    /// The size recorded in a stream's directory entry is inconsistent with
+    /// the size implied by the stream's contents.
+    StreamSizeMismatch,
+};
+
+/// Reader creates a new minidump reader type for the given source type. The
+/// source must have both a "reader()" and "seekableStream()" function.
+///
+/// Given the format of a minidump file, we must keep the source open and
+/// continually access it because the format of the minidump is full of
+/// pointers and offsets that we must follow depending on the stream types.
+/// Also, since we're not aware of all stream types (in fact it's impossible
+/// to be aware since custom stream types are allowed), it's possible any
+/// stream type can define its own pointers and offsets. So, the source must
+/// always be available so callers can decode the streams as needed.
+pub fn Reader(comptime S: type) type {
+    return struct {
+        const Self = @This();
+
+        /// The source data.
+        source: Source,
+
+        /// The endianness of the minidump file. This is detected by reading
+        /// the byte order of the header.
+        endian: std.builtin.Endian,
+
+        /// The number of streams within the minidump file. This is read from
+        /// the header and stored here so we can quickly access them. Note
+        /// the stream types require reading the source; this is an optimization
+        /// to avoid any allocations on the reader and the caller can choose
+        /// to store them if they want.
+        stream_count: u32,
+
+        /// The position of the first stream directory entry in the source,
+        /// as read from the header.
+        stream_directory_rva: u32,
+
+        /// The type on which reader() and seekableStream() are declared:
+        /// the pointee when Source is a pointer, otherwise Source itself.
+        const SourceCallable = switch (@typeInfo(Source)) {
+            .Pointer => |v| v.child,
+            .Struct => Source,
+            else => @compileError("Source type must be a pointer or struct"),
+        };
+
+        const SourceReader = @typeInfo(@TypeOf(SourceCallable.reader)).Fn.return_type.?;
+        const SourceSeeker = @typeInfo(@TypeOf(SourceCallable.seekableStream)).Fn.return_type.?;
+
+        /// A limited reader for reading data from the source.
+        pub const LimitedReader = std.io.LimitedReader(SourceReader);
+
+        /// The source type for the reader.
+        pub const Source = S;
+
+        /// The stream types for reading
+        pub const ThreadList = stream.thread_list.ThreadListReader(Self);
+
+        /// The reader type for stream reading. This has some other methods so
+        /// you must still call reader() on the result to get the actual
+        /// reader to read the data.
+        pub const StreamReader = struct {
+            source: Source,
+            endian: std.builtin.Endian,
+            directory: external.Directory,
+
+            /// Should not be accessed directly. This is setup whenever
+            /// reader() is called.
+            limit_reader: LimitedReader = undefined,
+
+            pub const Reader = LimitedReader.Reader;
+
+            /// Returns a Reader implementation that reads the bytes of the
+            /// stream, limited to the data size in the directory entry.
+            ///
+            /// This does not seek, so it cannot fail: it reads from wherever
+            /// the source is currently positioned. streamReader() leaves the
+            /// source at the start of the payload; if the source has been
+            /// moved since then, call seekToPayload() first.
+            ///
+            /// The reader is dependent on the state of Source so any
+            /// state-changing operations on Source will invalidate the
+            /// reader. For example, making another reader, reading another
+            /// stream directory, closing the source, etc.
+            pub fn reader(self: *StreamReader) LimitedReader.Reader {
+                self.limit_reader = .{
+                    .inner_reader = self.source.reader(),
+                    .bytes_left = self.directory.location.data_size,
+                };
+                return self.limit_reader.reader();
+            }
+
+            /// Seeks the source to the start of this stream's payload, i.e.
+            /// the location recorded in the directory entry.
+            pub fn seekToPayload(self: *StreamReader) !void {
+                try self.source.seekableStream().seekTo(self.directory.location.rva);
+            }
+        };
+
+        /// Iterator type to read over the streams in the minidump file.
+        pub const StreamIterator = struct {
+            reader: *const Self,
+            i: u32 = 0,
+
+            /// Returns a StreamReader for the next stream, or null once
+            /// stream_count streams have been returned. Each call reads a
+            /// directory entry and therefore moves the source position.
+            pub fn next(self: *StreamIterator) !?StreamReader {
+                if (self.i >= self.reader.stream_count) return null;
+                const dir = try self.reader.directory(self.i);
+                self.i += 1;
+                return try self.reader.streamReader(dir);
+            }
+        };
+
+        /// Initialize a reader. The source must remain available for the entire
+        /// lifetime of the reader. The reader does not take ownership of the
+        /// source so if it has resources that need to be cleaned up, the caller
+        /// must do so once the reader is no longer needed.
+        pub fn init(source: Source) !Self {
+            const header, const endian = try readHeader(Source, source);
+            return .{
+                .source = source,
+                .endian = endian,
+                .stream_count = header.stream_count,
+                .stream_directory_rva = header.stream_directory_rva,
+            };
+        }
+
+        /// Return an iterator to read over the streams in the minidump file.
+        /// This is very similar to using a simple for loop to stream_count
+        /// and calling directory() on each index, but is more idiomatic
+        /// Zig.
+        pub fn streamIterator(self: *const Self) StreamIterator {
+            return .{ .reader = self };
+        }
+
+        /// Return a StreamReader for the given directory entry and seek the
+        /// source to the start of the stream's payload. This streams
+        /// from the underlying source so the returned reader is only valid
+        /// as long as the source is unmodified (i.e. the source is not
+        /// closed, the source seek position is not moved, etc.).
+        pub fn streamReader(
+            self: *const Self,
+            dir: external.Directory,
+        ) SourceSeeker.SeekError!StreamReader {
+            // Position the source here (the fallible step) so that
+            // StreamReader.reader() can stay infallible.
+            try self.source.seekableStream().seekTo(dir.location.rva);
+            return .{
+                .source = self.source,
+                .endian = self.endian,
+                .directory = dir,
+            };
+        }
+
+        /// Get the directory entry with the given index.
+        ///
+        /// Asserts the index is valid (idx < stream_count).
+        pub fn directory(self: *const Self, idx: usize) !external.Directory {
+            assert(idx < self.stream_count);
+
+            // Seek to the directory entry. Both the rva and the stream count
+            // come from the file, so the position is computed in 64 bits
+            // where it cannot overflow; an out-of-range position surfaces as
+            // a seek/read error instead of a failed cast or addition.
+            const offset = @as(u64, @sizeOf(external.Directory)) * @as(u64, idx);
+            const pos = @as(u64, self.stream_directory_rva) + offset;
+            try self.source.seekableStream().seekTo(pos);
+
+            // Read the directory.
+            return try self.source.reader().readStructEndian(
+                external.Directory,
+                self.endian,
+            );
+        }
+
+        /// Return a reader for the given location descriptor. This seeks
+        /// the source to the location and is only valid until the reader
+        /// source is modified in some way.
+        pub fn locationReader(
+            self: *const Self,
+            loc: external.LocationDescriptor,
+        ) !LimitedReader {
+            try self.source.seekableStream().seekTo(loc.rva);
+            return .{
+                .inner_reader = self.source.reader(),
+                .bytes_left = loc.data_size,
+            };
+        }
+    };
+}
+
+/// Reads the minidump header from the current position of `source` and
+/// returns it together with the detected byte order of the file.
+///
+/// The header is decoded as little-endian first. If the signature does not
+/// match, the source is rewound to offset 0 and decoded as big-endian.
+/// Returns InvalidHeader if neither byte order yields the signature and
+/// InvalidVersion if the low version word is not the expected one.
+fn readHeader(comptime T: type, source: T) !struct {
+    external.Header,
+    std.builtin.Endian,
+} {
+    var header = try source.reader().readStructEndian(external.Header, .little);
+
+    const endian: std.builtin.Endian = if (header.signature == external.signature)
+        .little
+    else big: {
+        // The little-endian decode didn't produce the signature, so rewind
+        // to the start of the file and decode again as big-endian.
+        try source.seekableStream().seekTo(0);
+        header = try source.reader().readStructEndian(external.Header, .big);
+        if (header.signature != external.signature) return ReadError.InvalidHeader;
+        break :big .big;
+    };
+
+    // "The low-order word is MINIDUMP_VERSION. The high-order word is an
+    // internal value that is implementation specific."
+    if (header.version.low != external.version) return ReadError.InvalidVersion;
+
+    return .{ header, endian };
+}
+
+// Logs the directory entry of every stream in the test minidump. Useful for
+// inspecting the layout of a dump while debugging.
+test "minidump debug" {
+    var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp"));
+    const minidump = try Reader(*@TypeOf(fbs)).init(&fbs);
+
+    var streams = minidump.streamIterator();
+    while (try streams.next()) |entry| {
+        // The iterator has already advanced past this entry, hence the -1.
+        log.warn("directory i={} dir={}", .{ streams.i - 1, entry.directory });
+    }
+}
+
+// Reads the header of the test minidump and then the full payload of its
+// first stream, checking the sizes against known values.
+test "minidump read" {
+    const testing = std.testing;
+
+    var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp"));
+    const minidump = try Reader(*@TypeOf(fbs)).init(&fbs);
+    try testing.expectEqual(std.builtin.Endian.little, minidump.endian);
+    try testing.expectEqual(7, minidump.stream_count);
+
+    // The first directory entry is the thread list (type 3).
+    const first = try minidump.directory(0);
+    try testing.expectEqual(3, first.stream_type);
+    try testing.expectEqual(584, first.location.data_size);
+
+    // Reading the stream to the end must yield exactly data_size bytes.
+    var payload = std.ArrayList(u8).init(testing.allocator);
+    defer payload.deinit();
+    var stream_reader = try minidump.streamReader(first);
+    try stream_reader.reader().readAllArrayList(&payload, std.math.maxInt(usize));
+    try testing.expectEqual(584, payload.items.len);
+}
--- a/src/crash/minidump/stream.zig
+++ b/src/crash/minidump/stream.zig
@ -0,0 +1,30 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+
+const log = std.log.scoped(.minidump_stream);
+
+/// The known stream types.
+pub const thread_list = @import("stream_threadlist.zig");
+
+/// A stream within the minidump file. A stream can be either in an encoded
+/// form or decoded form. The encoded form is raw bytes that aren't validated
+/// until they're decoded. The decoded form is a structured form of the stream.
+///
+/// The decoded form is more ergonomic to work with but the encoded form is
+/// more efficient to read/write.
+///
+/// NOTE(review): only the encoded variant is declared here; no decoded
+/// variants exist in this union yet.
+pub const Stream = union(enum) {
+    encoded: EncodedStream,
+};
+
+/// An encoded stream value. It is "encoded" in the sense that it is raw bytes
+/// with a type associated. The raw bytes are not validated to be correct for
+/// the type.
+pub const EncodedStream = struct {
+    /// The numeric stream type.
+    type: u32,
+    /// The raw, unvalidated bytes of the stream. This is a borrowed slice;
+    /// nothing in this struct frees it.
+    data: []const u8,
+};
+
+// Reference every declaration in this file so that the tests of the
+// imported stream implementations are analyzed and run.
+test {
+    const testing = @import("std").testing;
+    testing.refAllDecls(@This());
+}
--- a/src/crash/minidump/stream_threadlist.zig
+++ b/src/crash/minidump/stream_threadlist.zig
@ -0,0 +1,117 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const external = @import("external.zig");
+const readerpkg = @import("reader.zig");
+const Reader = readerpkg.Reader;
+const ReadError = readerpkg.ReadError;
+
+const log = std.log.scoped(.minidump_stream);
+
+/// This is the list of threads from the process.
+///
+/// This is the Reader implementation. You usually do not use this directly.
+/// Instead, use Reader(T).ThreadList which will get you the same thing.
+///
+/// ThreadList is stream type 0x3.
+/// StreamReader is the Reader(T).StreamReader type.
+pub fn ThreadListReader(comptime R: type) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of threads in the list.
+        count: u32,
+
+        /// The rva to the first thread in the list.
+        rva: u32,
+
+        /// Source data and endianness so we can read.
+        source: R.Source,
+        endian: std.builtin.Endian,
+
+        /// Read the thread list header from the given stream.
+        ///
+        /// Asserts that the stream is a thread list (stream type 0x3). This
+        /// seeks the source to the stream payload and reads the thread
+        /// count. Returns StreamSizeMismatch if the size in the directory
+        /// entry is not consistent with the thread count, in addition to any
+        /// error from seeking or reading the source.
+        pub fn init(r: *R.StreamReader) !Self {
+            assert(r.directory.stream_type == 0x3);
+            try r.seekToPayload();
+            const reader = r.source.reader();
+
+            // Our count is always a u32 in the header.
+            const count = try reader.readInt(u32, r.endian);
+
+            const data_size = r.directory.location.data_size;
+
+            // The size the stream would have with no padding. The count
+            // comes straight from the file, so this is computed in 64 bits:
+            // in 32 bits a large count would overflow the multiplication.
+            const unpadded_size: u64 =
+                @sizeOf(u32) + @as(u64, @sizeOf(external.Thread)) * count;
+
+            // Determine if we have padding in our header. It is possible
+            // for there to be padding if the list header was written by
+            // a 32-bit process but is being read on a 64-bit process.
+            const padding: u32 = switch (std.math.order(unpadded_size, @as(u64, data_size))) {
+                // It should never be larger than what the directory says.
+                .gt => return ReadError.StreamSizeMismatch,
+
+                // If the sizes match exactly we're good.
+                .eq => 0,
+
+                // The only slack we accept is exactly 4 bytes of padding.
+                .lt => blk: {
+                    if (data_size - unpadded_size != 4) return ReadError.StreamSizeMismatch;
+                    break :blk 4;
+                },
+            };
+
+            // Rva is the location of the first thread in the list. A stream
+            // located so close to the end of the 32-bit offset space that
+            // this overflows cannot actually hold its threads, so report it
+            // as a size mismatch rather than overflowing.
+            const header_len: u32 = @as(u32, @sizeOf(u32)) + padding;
+            const rva = std.math.add(u32, r.directory.location.rva, header_len) catch
+                return ReadError.StreamSizeMismatch;
+
+            return .{
+                .count = count,
+                .rva = rva,
+                .source = r.source,
+                .endian = r.endian,
+            };
+        }
+
+        /// Get the thread entry for the given index.
+        ///
+        /// Index is asserted to be less than count. This seeks the source,
+        /// so it invalidates any reader that depends on the source position.
+        pub fn thread(self: *const Self, i: usize) !external.Thread {
+            assert(i < self.count);
+
+            // Seek to the thread. The position is computed in 64 bits so a
+            // large rva or index from the file cannot overflow; a position
+            // past the end of the source surfaces as a seek/read error.
+            const offset = @as(u64, @sizeOf(external.Thread)) * @as(u64, i);
+            const pos = @as(u64, self.rva) + offset;
+            try self.source.seekableStream().seekTo(pos);
+
+            // Read the thread
+            return try self.source.reader().readStructEndian(
+                external.Thread,
+                self.endian,
+            );
+        }
+    };
+}
+
+// Decodes the thread list of the test minidump and reads the stack memory
+// of every thread in it.
+test "minidump: threadlist" {
+    const testing = std.testing;
+    const gpa = testing.allocator;
+
+    var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp"));
+    const MinidumpReader = Reader(*@TypeOf(fbs));
+    const minidump = try MinidumpReader.init(&fbs);
+
+    // The first stream in the test file is the thread list.
+    const dir = try minidump.directory(0);
+    try testing.expectEqual(3, dir.stream_type);
+    var stream_reader = try minidump.streamReader(dir);
+
+    // Decode the stream into the structured thread list.
+    const threads = try MinidumpReader.ThreadList.init(&stream_reader);
+    log.warn("threadlist count={} rva={}", .{ threads.count, threads.rva });
+    try testing.expectEqual(12, threads.count);
+
+    var i: usize = 0;
+    while (i < threads.count) : (i += 1) {
+        const t = try threads.thread(i);
+        log.warn("thread i={} thread={}", .{ i, t });
+
+        // Read the thread's stack memory; we only check it can be read.
+        var stack = try minidump.locationReader(t.stack.memory);
+        const bytes = try stack.reader().readAllAlloc(gpa, t.stack.memory.data_size);
+        gpa.free(bytes);
+    }
+}
--- a/src/crash/testdata/macos.dmp
+++ b/src/crash/testdata/macos.dmp