From 3cc18b62e7ae7b82009c7dd80bc6de94cc4df8e1 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Sat, 7 Sep 2024 09:35:43 -0700
Subject: [PATCH] crash/minidump: split out into multiple files

---
 src/crash/minidump.zig          | 183 +-------------------------------
 src/crash/minidump/external.zig |  36 +++++++
 src/crash/minidump/minidump.zig | 154 +++++++++++++++++++++++++++
 src/crash/minidump/stream.zig   |  21 ++++
 4 files changed, 216 insertions(+), 178 deletions(-)
 create mode 100644 src/crash/minidump/external.zig
 create mode 100644 src/crash/minidump/minidump.zig
 create mode 100644 src/crash/minidump/stream.zig

diff --git a/src/crash/minidump.zig b/src/crash/minidump.zig
index fbd4ec809..0cf641114 100644
--- a/src/crash/minidump.zig
+++ b/src/crash/minidump.zig
@@ -1,181 +1,8 @@
-const std = @import("std");
-const assert = std.debug.assert;
-const Allocator = std.mem.Allocator;
+const minidump = @import("minidump/minidump.zig");
 
-const log = std.log.scoped(.minidump);
+pub const stream = @import("minidump/stream.zig");
+pub const Minidump = minidump.Minidump;
 
-/// Minidump file format.
-pub const Minidump = struct {
-    /// The arena that all streams are allocated within when reading the
-    /// minidump file. This is freed on deinit.
-    arena: std.heap.ArenaAllocator,
-
-    /// The header of the minidump file. On serialization, the stream count
-    /// and rva will be updated to match the streams. On deserialization,
-    /// this is read directly from the file.
-    header: Header,
-
-    /// The streams within the minidump file in the order they're serialized.
-    streams: std.ArrayListUnmanaged(Stream),
-
-    pub const Stream = struct {
-        type: u32,
-        data: []const u8,
-    };
-
-    /// Read the minidump file for the given source.
-    ///
-    /// The source must have a reader() and seekableStream() method.
-    /// For example, both File and std.io.FixedBufferStream implement these.
-    ///
-    /// The reader will read the full minidump data into memory. This makes
-    /// it easy to serialize the data back out. This is acceptable for our
-    /// use case which doesn't rely too much on being memory efficient or
-    /// high load. We also expect the minidump files to be relatively small
-    /// (dozens of MB at most, hundreds of KB typically).
-    ///
-    /// NOTE(mitchellh): If we ever want to make this more memory efficient,
-    /// I would create a new type that is a "lazy reader" that stores the
-    /// source type and reads the data as needed. Then this type should use
-    /// that type.
-    pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump {
-        var arena = std.heap.ArenaAllocator.init(alloc_gpa);
-        errdefer arena.deinit();
-        const alloc = arena.allocator();
-
-        // Read the header which also determines the endianness of the file.
-        const header, const endian = try readHeader(source);
-
-        var streams = try std.ArrayListUnmanaged(Stream).initCapacity(
-            alloc,
-            header.stream_count,
-        );
-        errdefer streams.deinit(alloc);
-
-        // Read the streams. All the streams are first described in a
-        // "directory" structure which tells us the type of stream and
-        // where it is located in the file. The directory structures are
-        // stored in a contiguous block at the stream_directory_rva.
-        //
-        // Due to how we use this structure, we read directories one by one,
-        // then read all the data for that directory, then move on to the
-        // next directory. This is because we copy all the minidump data
-        // into memory.
-        const seeker = source.seekableStream();
-        try seeker.seekTo(header.stream_directory_rva);
-        for (0..header.stream_count) |_| {
-            // Read the current directory
-            const directory = try source.reader().readStructEndian(Directory, endian);
-
-            // Seek to the location of the data. We have to store our current
-            // position because we need to seek back to it after reading the
-            // data in order to read the next directory.
-            const pos = try seeker.getPos();
-            try seeker.seekTo(directory.location.rva);
-
-            // Read the data. The data length is defined by the directory.
-            // If we can't read exactly that amount of data, we return an error.
-            var data = std.ArrayList(u8).init(alloc);
-            defer data.deinit();
-            source.reader().readAllArrayList(
-                &data,
-                directory.location.data_size,
-            ) catch |err| switch (err) {
-                // This means there was more data in the reader than what
-                // we asked for this. This is okay and expected because
-                // all streams except the last one will have this error.
-                error.StreamTooLong => {},
-                else => return err,
-            };
-
-            // Basic check.
-            if (data.items.len != directory.location.data_size) return error.DataSizeMismatch;
-
-            // Store our stream
-            try streams.append(alloc, .{
-                .type = directory.stream_type,
-                .data = try data.toOwnedSlice(),
-            });
-
-            // Seek back to where we were after reading this directory
-            // entry so we can read the next one.
-            try seeker.seekTo(pos);
-        }
-
-        return .{
-            .arena = arena,
-            .header = header,
-            .streams = streams,
-        };
-    }
-
-    /// Reads the header for the minidump file and returns endianness of
-    /// the file.
-    fn readHeader(source: anytype) !struct { Header, std.builtin.Endian } {
-        // Start by trying LE.
-        var endian: std.builtin.Endian = .little;
-        var header = try source.reader().readStructEndian(Header, endian);
-
-        // If the signature doesn't match, we assume its BE.
-        if (header.signature != signature) {
-            // Seek back to the start of the file so we can reread.
-            try source.seekableStream().seekTo(0);
-
-            // Try BE, if the signature doesn't match, return an error.
-            endian = .big;
-            header = try source.reader().readStructEndian(Header, endian);
-            if (header.signature != signature) return error.InvalidHeader;
-        }
-
-        // "The low-order word is MINIDUMP_VERSION. The high-order word is an
-        // internal value that is implementation specific."
-        if (header.version.low != version) return error.InvalidVersion;
-
-        return .{ header, endian };
-    }
-
-    pub fn deinit(self: *Minidump) void {
-        self.arena.deinit();
-    }
-
-    /// The arena allocator associated with this envelope
-    pub fn allocator(self: *Minidump) Allocator {
-        return self.arena.allocator();
-    }
-};
-/// "MDMP" in little-endian.
-pub const signature = 0x504D444D;
-
-/// The version of the minidump format.
-pub const version = 0xA793;
-
-/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_header
-pub const Header = extern struct {
-    signature: u32,
-    version: packed struct(u32) { low: u16, high: u16 },
-    stream_count: u32,
-    stream_directory_rva: u32,
-    checksum: u32,
-    time_date_stamp: u32,
-    flags: u64,
-};
-
-/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory
-pub const Directory = extern struct {
-    stream_type: u32,
-    location: LocationDescriptor,
-};
-
-/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor
-pub const LocationDescriptor = extern struct {
-    data_size: u32,
-    rva: u32,
-};
-
-test "Minidump read" {
-    const testing = std.testing;
-    const alloc = testing.allocator;
-    var fbs = std.io.fixedBufferStream(@embedFile("testdata/macos.dmp"));
-    var md = try Minidump.read(alloc, &fbs);
-    defer md.deinit();
+test {
+    @import("std").testing.refAllDecls(@This());
 }
diff --git a/src/crash/minidump/external.zig b/src/crash/minidump/external.zig
new file mode 100644
index 000000000..9356a6cb3
--- /dev/null
+++ b/src/crash/minidump/external.zig
@@ -0,0 +1,36 @@
+//! This file contains the external structs and constants for the minidump
+//! format. Most are from the Microsoft documentation on the minidump format:
+//! https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/
+//!
+//! Wherever possible, we also compare our definitions to other projects
+//! such as rust-minidump, libmdmp, breakpad, etc. to ensure we're doing
+//! the right thing.
+
+/// "MDMP" in little-endian.
+pub const signature = 0x504D444D;
+
+/// The version of the minidump format.
+pub const version = 0xA793;
+
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_header
+pub const Header = extern struct {
+    signature: u32,
+    version: packed struct(u32) { low: u16, high: u16 },
+    stream_count: u32,
+    stream_directory_rva: u32,
+    checksum: u32,
+    time_date_stamp: u32,
+    flags: u64,
+};
+
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory
+pub const Directory = extern struct {
+    stream_type: u32,
+    location: LocationDescriptor,
+};
+
+/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor
+pub const LocationDescriptor = extern struct {
+    data_size: u32,
+    rva: u32,
+};
diff --git a/src/crash/minidump/minidump.zig b/src/crash/minidump/minidump.zig
new file mode 100644
index 000000000..2056212a7
--- /dev/null
+++ b/src/crash/minidump/minidump.zig
@@ -0,0 +1,154 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const external = @import("external.zig");
+const stream = @import("stream.zig");
+const Stream = stream.Stream;
+
+const log = std.log.scoped(.minidump);
+
+/// Minidump file format.
+pub const Minidump = struct {
+    /// The arena that all streams are allocated within when reading the
+    /// minidump file. This is freed on deinit.
+    arena: std.heap.ArenaAllocator,
+
+    /// The header of the minidump file. On serialization, the stream count
+    /// and rva will be updated to match the streams. On deserialization,
+    /// this is read directly from the file.
+    header: external.Header,
+
+    /// The streams within the minidump file in the order they're serialized.
+    streams: std.ArrayListUnmanaged(Stream),
+
+    /// Read the minidump file for the given source.
+    ///
+    /// The source must have a reader() and seekableStream() method.
+    /// For example, both File and std.io.FixedBufferStream implement these.
+    ///
+    /// The reader will read the full minidump data into memory. This makes
+    /// it easy to serialize the data back out. This is acceptable for our
+    /// use case which doesn't rely too much on being memory efficient or
+    /// high load. We also expect the minidump files to be relatively small
+    /// (dozens of MB at most, hundreds of KB typically).
+    ///
+    /// NOTE(mitchellh): If we ever want to make this more memory efficient,
+    /// I would create a new type that is a "lazy reader" that stores the
+    /// source type and reads the data as needed. Then this type should use
+    /// that type.
+    pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump {
+        var arena = std.heap.ArenaAllocator.init(alloc_gpa);
+        errdefer arena.deinit();
+        const alloc = arena.allocator();
+
+        // Read the header which also determines the endianness of the file.
+        const header, const endian = try readHeader(source);
+        //log.warn("header={} endian={}", .{ header, endian });
+
+        var streams = try std.ArrayListUnmanaged(Stream).initCapacity(
+            alloc,
+            header.stream_count,
+        );
+        errdefer streams.deinit(alloc);
+
+        // Read the streams. All the streams are first described in a
+        // "directory" structure which tells us the type of stream and
+        // where it is located in the file. The directory structures are
+        // stored in a contiguous block at the stream_directory_rva.
+        //
+        // Due to how we use this structure, we read directories one by one,
+        // then read all the data for that directory, then move on to the
+        // next directory. This is because we copy all the minidump data
+        // into memory.
+        const seeker = source.seekableStream();
+        try seeker.seekTo(header.stream_directory_rva);
+        for (0..header.stream_count) |_| {
+            // Read the current directory
+            const directory = try source.reader().readStructEndian(external.Directory, endian);
+            log.debug("directory={}", .{directory});
+
+            // Seek to the location of the data. We have to store our current
+            // position because we need to seek back to it after reading the
+            // data in order to read the next directory.
+            const pos = try seeker.getPos();
+
+            try seeker.seekTo(directory.location.rva);
+
+            // Read the data. The data length is defined by the directory.
+            // If we can't read exactly that amount of data, we return an error.
+            var data = std.ArrayList(u8).init(alloc);
+            defer data.deinit();
+            source.reader().readAllArrayList(
+                &data,
+                directory.location.data_size,
+            ) catch |err| switch (err) {
+                // This means there was more data left in the reader than
+                // the size we asked for. This is okay and expected because
+                // all streams except the last one will have this error.
+                error.StreamTooLong => {},
+                else => return err,
+            };
+
+            // Basic check.
+            if (data.items.len != directory.location.data_size) return error.DataSizeMismatch;
+
+            // Store our stream
+            try streams.append(alloc, .{ .encoded = .{
+                .type = directory.stream_type,
+                .data = try data.toOwnedSlice(),
+            } });
+
+            // Seek back to where we were after reading this directory
+            // entry so we can read the next one.
+            try seeker.seekTo(pos);
+        }
+
+        return .{
+            .arena = arena,
+            .header = header,
+            .streams = streams,
+        };
+    }
+
+    /// Reads the header for the minidump file and returns endianness of
+    /// the file.
+    fn readHeader(source: anytype) !struct { external.Header, std.builtin.Endian } {
+        // Start by trying LE.
+        var endian: std.builtin.Endian = .little;
+        var header = try source.reader().readStructEndian(external.Header, endian);
+
+        // If the signature doesn't match, we assume it's BE.
+        if (header.signature != external.signature) {
+            // Seek back to the start of the file so we can reread.
+            try source.seekableStream().seekTo(0);
+
+            // Try BE, if the signature doesn't match, return an error.
+            endian = .big;
+            header = try source.reader().readStructEndian(external.Header, endian);
+            if (header.signature != external.signature) return error.InvalidHeader;
+        }
+
+        // "The low-order word is MINIDUMP_VERSION. The high-order word is an
+        // internal value that is implementation specific."
+        if (header.version.low != external.version) return error.InvalidVersion;
+
+        return .{ header, endian };
+    }
+
+    pub fn deinit(self: *Minidump) void {
+        self.arena.deinit();
+    }
+
+    /// The arena allocator associated with this minidump.
+    pub fn allocator(self: *Minidump) Allocator {
+        return self.arena.allocator();
+    }
+};
+
+test "Minidump read" {
+    const testing = std.testing;
+    const alloc = testing.allocator;
+    var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp"));
+    var md = try Minidump.read(alloc, &fbs);
+    defer md.deinit();
+}
diff --git a/src/crash/minidump/stream.zig b/src/crash/minidump/stream.zig
new file mode 100644
index 000000000..d607ed82b
--- /dev/null
+++ b/src/crash/minidump/stream.zig
@@ -0,0 +1,21 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+
+/// A stream within the minidump file. A stream can be either in an encoded
+/// form or decoded form. The encoded form is raw bytes that aren't validated
+/// until they're decoded. The decoded form is a structured form of the stream.
+///
+/// The decoded form is more ergonomic to work with but the encoded form is
+/// more efficient to read/write.
+pub const Stream = union(enum) {
+    encoded: EncodedStream,
+};
+
+/// An encoded stream value. It is "encoded" in the sense that it is raw bytes
+/// with a type associated. The raw bytes are not validated to be correct for
+/// the type.
+pub const EncodedStream = struct {
+    type: u32,
+    data: []const u8,
+};