From ae8859bc7bcf0267c00dfb0c136c40be4c1fc0f4 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Fri, 6 Sep 2024 21:50:47 -0700
Subject: [PATCH] crash/minidump: read the streams from the minidump file

---
 src/crash/minidump.zig | 118 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/crash/minidump.zig b/src/crash/minidump.zig
index caec0f1ea..fbd4ec809 100644
--- a/src/crash/minidump.zig
+++ b/src/crash/minidump.zig
@@ -4,23 +4,108 @@ const Allocator = std.mem.Allocator;
 
 const log = std.log.scoped(.minidump);
 
-/// Minidump parser.
+/// Minidump file format.
 pub const Minidump = struct {
+    /// The arena that all streams are allocated within when reading the
+    /// minidump file. This is freed on deinit.
+    arena: std.heap.ArenaAllocator,
+
+    /// The header of the minidump file. On serialization, the stream count
+    /// and rva will be updated to match the streams. On deserialization,
+    /// this is read directly from the file.
     header: Header,
 
+    /// The streams within the minidump file in the order they're serialized.
+    streams: std.ArrayListUnmanaged(Stream),
+
+    pub const Stream = struct {
+        type: u32,
+        data: []const u8,
+    };
+
     /// Read the minidump file for the given source.
     ///
     /// The source must have a reader() and seekableStream() method.
     /// For example, both File and std.io.FixedBufferStream implement these.
-    pub fn read(alloc: Allocator, source: anytype) !Minidump {
-        _ = alloc;
+    ///
+    /// The reader will read the full minidump data into memory. This makes
+    /// it easy to serialize the data back out. This is acceptable for our
+    /// use case, which is neither memory-constrained nor under high load.
+    /// We also expect the minidump files to be relatively small
+    /// (dozens of MB at most, hundreds of KB typically).
+    ///
+    /// NOTE(mitchellh): If we ever want to make this more memory efficient,
+    /// I would create a new type that is a "lazy reader" that stores the
+    /// source type and reads the data as needed. Then this type should use
+    /// that type.
+    pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump {
+        var arena = std.heap.ArenaAllocator.init(alloc_gpa);
+        errdefer arena.deinit();
+        const alloc = arena.allocator();
 
         // Read the header which also determines the endianness of the file.
         const header, const endian = try readHeader(source);
-        log.warn("header={} endian={}", .{ header, endian });
+
+        var streams = try std.ArrayListUnmanaged(Stream).initCapacity(
+            alloc,
+            header.stream_count,
+        );
+        errdefer streams.deinit(alloc);
+
+        // Read the streams. All the streams are first described in a
+        // "directory" structure which tells us the type of stream and
+        // where it is located in the file. The directory structures are
+        // stored in a contiguous block at the stream_directory_rva.
+        //
+        // Due to how we use this structure, we read directories one by one,
+        // then read all the data for that directory, then move on to the
+        // next directory. This is because we copy all the minidump data
+        // into memory.
+        const seeker = source.seekableStream();
+        try seeker.seekTo(header.stream_directory_rva);
+        for (0..header.stream_count) |_| {
+            // Read the current directory
+            const directory = try source.reader().readStructEndian(Directory, endian);
+
+            // Seek to the location of the data. We have to store our current
+            // position because we need to seek back to it after reading the
+            // data in order to read the next directory.
+            const pos = try seeker.getPos();
+            try seeker.seekTo(directory.location.rva);
+
+            // Read the data. The data length is defined by the directory.
+            // If we can't read exactly that amount of data, we return an error.
+            var data = std.ArrayList(u8).init(alloc);
+            defer data.deinit();
+            source.reader().readAllArrayList(
+                &data,
+                directory.location.data_size,
+            ) catch |err| switch (err) {
+                // This means there was more data in the reader than what
+                // we asked for. This is okay and expected because
+                // all streams except the last one will have this error.
+                error.StreamTooLong => {},
+                else => return err,
+            };
+
+            // Basic check: fewer bytes than declared means the data is truncated.
+            if (data.items.len != directory.location.data_size) return error.DataSizeMismatch;
+
+            // Store our stream
+            try streams.append(alloc, .{
+                .type = directory.stream_type,
+                .data = try data.toOwnedSlice(),
+            });
+
+            // Seek back to where we were after reading this directory
+            // entry so we can read the next one.
+            try seeker.seekTo(pos);
+        }
 
         return .{
+            .arena = arena,
             .header = header,
+            .streams = streams,
         };
     }
 
@@ -48,8 +133,16 @@ pub const Minidump = struct {
 
         return .{ header, endian };
     }
-};
 
+    pub fn deinit(self: *Minidump) void {
+        self.arena.deinit();
+    }
+
+    /// The arena allocator associated with this minidump.
+    pub fn allocator(self: *Minidump) Allocator {
+        return self.arena.allocator();
+    }
+};
 /// "MDMP" in little-endian.
 pub const signature = 0x504D444D;
 
@@ -67,9 +160,22 @@ pub const Header = extern struct {
     flags: u64,
 };
 
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory
+pub const Directory = extern struct {
+    stream_type: u32,
+    location: LocationDescriptor,
+};
+
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor
+pub const LocationDescriptor = extern struct {
+    data_size: u32,
+    rva: u32,
+};
+
 test "Minidump read" {
     const testing = std.testing;
     const alloc = testing.allocator;
     var fbs = std.io.fixedBufferStream(@embedFile("testdata/macos.dmp"));
-    _ = try Minidump.read(alloc, &fbs);
+    var md = try Minidump.read(alloc, &fbs);
+    defer md.deinit();
 }