From 3cc18b62e7ae7b82009c7dd80bc6de94cc4df8e1 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Sat, 7 Sep 2024 09:35:43 -0700 Subject: [PATCH] crash/minidump: split out into multiple files --- src/crash/minidump.zig | 183 +------------------------------- src/crash/minidump/external.zig | 36 +++++++ src/crash/minidump/minidump.zig | 154 +++++++++++++++++++++++++++ src/crash/minidump/stream.zig | 21 ++++ 4 files changed, 216 insertions(+), 178 deletions(-) create mode 100644 src/crash/minidump/external.zig create mode 100644 src/crash/minidump/minidump.zig create mode 100644 src/crash/minidump/stream.zig diff --git a/src/crash/minidump.zig b/src/crash/minidump.zig index fbd4ec809..0cf641114 100644 --- a/src/crash/minidump.zig +++ b/src/crash/minidump.zig @@ -1,181 +1,8 @@ -const std = @import("std"); -const assert = std.debug.assert; -const Allocator = std.mem.Allocator; +const minidump = @import("minidump/minidump.zig"); -const log = std.log.scoped(.minidump); +pub const stream = @import("minidump/stream.zig"); +pub const Minidump = minidump.Minidump; -/// Minidump file format. -pub const Minidump = struct { - /// The arena that all streams are allocated within when reading the - /// minidump file. This is freed on deinit. - arena: std.heap.ArenaAllocator, - - /// The header of the minidump file. On serialization, the stream count - /// and rva will be updated to match the streams. On deserialization, - /// this is read directly from the file. - header: Header, - - /// The streams within the minidump file in the order they're serialized. - streams: std.ArrayListUnmanaged(Stream), - - pub const Stream = struct { - type: u32, - data: []const u8, - }; - - /// Read the minidump file for the given source. - /// - /// The source must have a reader() and seekableStream() method. - /// For example, both File and std.io.FixedBufferStream implement these. - /// - /// The reader will read the full minidump data into memory. 
This makes - /// it easy to serialize the data back out. This is acceptable for our - /// use case which doesn't rely too much on being memory efficient or - /// high load. We also expect the minidump files to be relatively small - /// (dozens of MB at most, hundreds of KB typically). - /// - /// NOTE(mitchellh): If we ever want to make this more memory efficient, - /// I would create a new type that is a "lazy reader" that stores the - /// source type and reads the data as needed. Then this type should use - /// that type. - pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump { - var arena = std.heap.ArenaAllocator.init(alloc_gpa); - errdefer arena.deinit(); - const alloc = arena.allocator(); - - // Read the header which also determines the endianness of the file. - const header, const endian = try readHeader(source); - - var streams = try std.ArrayListUnmanaged(Stream).initCapacity( - alloc, - header.stream_count, - ); - errdefer streams.deinit(alloc); - - // Read the streams. All the streams are first described in a - // "directory" structure which tells us the type of stream and - // where it is located in the file. The directory structures are - // stored in a contiguous block at the stream_directory_rva. - // - // Due to how we use this structure, we read directories one by one, - // then read all the data for that directory, then move on to the - // next directory. This is because we copy all the minidump data - // into memory. - const seeker = source.seekableStream(); - try seeker.seekTo(header.stream_directory_rva); - for (0..header.stream_count) |_| { - // Read the current directory - const directory = try source.reader().readStructEndian(Directory, endian); - - // Seek to the location of the data. We have to store our current - // position because we need to seek back to it after reading the - // data in order to read the next directory. - const pos = try seeker.getPos(); - try seeker.seekTo(directory.location.rva); - - // Read the data. 
The data length is defined by the directory. - // If we can't read exactly that amount of data, we return an error. - var data = std.ArrayList(u8).init(alloc); - defer data.deinit(); - source.reader().readAllArrayList( - &data, - directory.location.data_size, - ) catch |err| switch (err) { - // This means there was more data in the reader than what - // we asked for this. This is okay and expected because - // all streams except the last one will have this error. - error.StreamTooLong => {}, - else => return err, - }; - - // Basic check. - if (data.items.len != directory.location.data_size) return error.DataSizeMismatch; - - // Store our stream - try streams.append(alloc, .{ - .type = directory.stream_type, - .data = try data.toOwnedSlice(), - }); - - // Seek back to where we were after reading this directory - // entry so we can read the next one. - try seeker.seekTo(pos); - } - - return .{ - .arena = arena, - .header = header, - .streams = streams, - }; - } - - /// Reads the header for the minidump file and returns endianness of - /// the file. - fn readHeader(source: anytype) !struct { Header, std.builtin.Endian } { - // Start by trying LE. - var endian: std.builtin.Endian = .little; - var header = try source.reader().readStructEndian(Header, endian); - - // If the signature doesn't match, we assume its BE. - if (header.signature != signature) { - // Seek back to the start of the file so we can reread. - try source.seekableStream().seekTo(0); - - // Try BE, if the signature doesn't match, return an error. - endian = .big; - header = try source.reader().readStructEndian(Header, endian); - if (header.signature != signature) return error.InvalidHeader; - } - - // "The low-order word is MINIDUMP_VERSION. The high-order word is an - // internal value that is implementation specific." 
- if (header.version.low != version) return error.InvalidVersion; - - return .{ header, endian }; - } - - pub fn deinit(self: *Minidump) void { - self.arena.deinit(); - } - - /// The arena allocator associated with this envelope - pub fn allocator(self: *Minidump) Allocator { - return self.arena.allocator(); - } -}; -/// "MDMP" in little-endian. -pub const signature = 0x504D444D; - -/// The version of the minidump format. -pub const version = 0xA793; - -/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_header -pub const Header = extern struct { - signature: u32, - version: packed struct(u32) { low: u16, high: u16 }, - stream_count: u32, - stream_directory_rva: u32, - checksum: u32, - time_date_stamp: u32, - flags: u64, -}; - -/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory -pub const Directory = extern struct { - stream_type: u32, - location: LocationDescriptor, -}; - -/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor -pub const LocationDescriptor = extern struct { - data_size: u32, - rva: u32, -}; - -test "Minidump read" { - const testing = std.testing; - const alloc = testing.allocator; - var fbs = std.io.fixedBufferStream(@embedFile("testdata/macos.dmp")); - var md = try Minidump.read(alloc, &fbs); - defer md.deinit(); +test { + @import("std").testing.refAllDecls(@This()); } diff --git a/src/crash/minidump/external.zig b/src/crash/minidump/external.zig new file mode 100644 index 000000000..9356a6cb3 --- /dev/null +++ b/src/crash/minidump/external.zig @@ -0,0 +1,36 @@ +//! This file contains the external structs and constants for the minidump +//! format. Most are from the Microsoft documentation on the minidump format: +//! https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ +//! +//! Wherever possible, we also compare our definitions to other projects +//! 
such as rust-minidump, libmdmp, breakpad, etc. to ensure we're doing +//! the right thing. + +/// "MDMP" in little-endian. +pub const signature = 0x504D444D; + +/// The version of the minidump format. +pub const version = 0xA793; + +/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_header +pub const Header = extern struct { + signature: u32, + version: packed struct(u32) { low: u16, high: u16 }, + stream_count: u32, + stream_directory_rva: u32, + checksum: u32, + time_date_stamp: u32, + flags: u64, +}; + +/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_directory +pub const Directory = extern struct { + stream_type: u32, + location: LocationDescriptor, +}; + +/// https://learn.microsoft.com/en-us/windows/win32/api/minidumpapiset/ns-minidumpapiset-minidump_location_descriptor +pub const LocationDescriptor = extern struct { + data_size: u32, + rva: u32, +}; diff --git a/src/crash/minidump/minidump.zig b/src/crash/minidump/minidump.zig new file mode 100644 index 000000000..2056212a7 --- /dev/null +++ b/src/crash/minidump/minidump.zig @@ -0,0 +1,154 @@ +const std = @import("std"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; +const external = @import("external.zig"); +const stream = @import("stream.zig"); +const Stream = stream.Stream; + +const log = std.log.scoped(.minidump); + +/// Minidump file format. +pub const Minidump = struct { + /// The arena that all streams are allocated within when reading the + /// minidump file. This is freed on deinit. + arena: std.heap.ArenaAllocator, + + /// The header of the minidump file. On serialization, the stream count + /// and rva will be updated to match the streams. On deserialization, + /// this is read directly from the file. + header: external.Header, + + /// The streams within the minidump file in the order they're serialized. 
+ streams: std.ArrayListUnmanaged(Stream), + + /// Read the minidump file for the given source. + /// + /// The source must have a reader() and seekableStream() method. + /// For example, both File and std.io.FixedBufferStream implement these. + /// + /// The reader will read the full minidump data into memory. This makes + /// it easy to serialize the data back out. This is acceptable for our + /// use case which doesn't rely too much on being memory efficient or + /// high load. We also expect the minidump files to be relatively small + /// (dozens of MB at most, hundreds of KB typically). + /// + /// NOTE(mitchellh): If we ever want to make this more memory efficient, + /// I would create a new type that is a "lazy reader" that stores the + /// source type and reads the data as needed. Then this type should use + /// that type. + pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump { + var arena = std.heap.ArenaAllocator.init(alloc_gpa); + errdefer arena.deinit(); + const alloc = arena.allocator(); + + // Read the header which also determines the endianness of the file. + const header, const endian = try readHeader(source); + //log.warn("header={} endian={}", .{ header, endian }); + + var streams = try std.ArrayListUnmanaged(Stream).initCapacity( + alloc, + header.stream_count, + ); + errdefer streams.deinit(alloc); + + // Read the streams. All the streams are first described in a + // "directory" structure which tells us the type of stream and + // where it is located in the file. The directory structures are + // stored in a contiguous block at the stream_directory_rva. + // + // Due to how we use this structure, we read directories one by one, + // then read all the data for that directory, then move on to the + // next directory. This is because we copy all the minidump data + // into memory. 
+ const seeker = source.seekableStream(); + try seeker.seekTo(header.stream_directory_rva); + for (0..header.stream_count) |_| { + // Read the current directory + const directory = try source.reader().readStructEndian(external.Directory, endian); + log.warn("directory={}", .{directory}); + + // Seek to the location of the data. We have to store our current + // position because we need to seek back to it after reading the + // data in order to read the next directory. + const pos = try seeker.getPos(); + + try seeker.seekTo(directory.location.rva); + + // Read the data. The data length is defined by the directory. + // If we can't read exactly that amount of data, we return an error. + var data = std.ArrayList(u8).init(alloc); + defer data.deinit(); + source.reader().readAllArrayList( + &data, + directory.location.data_size, + ) catch |err| switch (err) { + // This means there was more data in the reader than what + // we asked for. This is okay and expected because + // all streams except the last one will have this error. + error.StreamTooLong => {}, + else => return err, + }; + + // Basic check. + if (data.items.len != directory.location.data_size) return error.DataSizeMismatch; + + // Store our stream + try streams.append(alloc, .{ .encoded = .{ + .type = directory.stream_type, + .data = try data.toOwnedSlice(), + } }); + + // Seek back to where we were after reading this directory + // entry so we can read the next one. + try seeker.seekTo(pos); + } + + return .{ + .arena = arena, + .header = header, + .streams = streams, + }; + } + + /// Reads the header for the minidump file and returns endianness of + /// the file. + fn readHeader(source: anytype) !struct { external.Header, std.builtin.Endian } { + // Start by trying LE. + var endian: std.builtin.Endian = .little; + var header = try source.reader().readStructEndian(external.Header, endian); + + // If the signature doesn't match, we assume it's BE. 
+ if (header.signature != external.signature) { + // Seek back to the start of the file so we can reread. + try source.seekableStream().seekTo(0); + + // Try BE; if the signature doesn't match, return an error. + endian = .big; + header = try source.reader().readStructEndian(external.Header, endian); + if (header.signature != external.signature) return error.InvalidHeader; + } + + // "The low-order word is MINIDUMP_VERSION. The high-order word is an + // internal value that is implementation specific." + if (header.version.low != external.version) return error.InvalidVersion; + + return .{ header, endian }; + } + + pub fn deinit(self: *Minidump) void { + self.arena.deinit(); + } + + /// The arena allocator associated with this minidump + pub fn allocator(self: *Minidump) Allocator { + return self.arena.allocator(); + } +}; + +test "Minidump read" { + const testing = std.testing; + const alloc = testing.allocator; + var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp")); + var md = try Minidump.read(alloc, &fbs); + defer md.deinit(); +} diff --git a/src/crash/minidump/stream.zig b/src/crash/minidump/stream.zig new file mode 100644 index 000000000..d607ed82b --- /dev/null +++ b/src/crash/minidump/stream.zig @@ -0,0 +1,21 @@ +const std = @import("std"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; + +/// A stream within the minidump file. A stream can be either in an encoded +/// form or decoded form. The encoded form is raw bytes that aren't validated +/// until they're decoded. The decoded form is a structured form of the stream. +/// +/// The decoded form is more ergonomic to work with but the encoded form is +/// more efficient to read/write. +pub const Stream = union(enum) { + encoded: EncodedStream, +}; + +/// An encoded stream value. It is "encoded" in the sense that it is raw bytes +/// with a type associated. The raw bytes are not validated to be correct for +/// the type. 
+pub const EncodedStream = struct { + type: u32, + data: []const u8, +};