From b8ec91242f96cf74f7545de7f70e2c80deb37829 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Sun, 8 Sep 2024 10:07:24 -0700 Subject: [PATCH] crash/minidump: reader that streams data from a source --- src/crash/minidump.zig | 4 +- src/crash/minidump/minidump.zig | 154 ----------------------------- src/crash/minidump/reader.zig | 167 ++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 156 deletions(-) delete mode 100644 src/crash/minidump/minidump.zig create mode 100644 src/crash/minidump/reader.zig diff --git a/src/crash/minidump.zig b/src/crash/minidump.zig index 0cf641114..1e103283f 100644 --- a/src/crash/minidump.zig +++ b/src/crash/minidump.zig @@ -1,7 +1,7 @@ -const minidump = @import("minidump/minidump.zig"); +const reader = @import("minidump/reader.zig"); pub const stream = @import("minidump/stream.zig"); -pub const Minidump = minidump.Minidump; +pub const Reader = reader.Reader; test { @import("std").testing.refAllDecls(@This()); diff --git a/src/crash/minidump/minidump.zig b/src/crash/minidump/minidump.zig deleted file mode 100644 index 2056212a7..000000000 --- a/src/crash/minidump/minidump.zig +++ /dev/null @@ -1,154 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const Allocator = std.mem.Allocator; -const external = @import("external.zig"); -const stream = @import("stream.zig"); -const Stream = stream.Stream; - -const log = std.log.scoped(.minidump); - -/// Minidump file format. -pub const Minidump = struct { - /// The arena that all streams are allocated within when reading the - /// minidump file. This is freed on deinit. - arena: std.heap.ArenaAllocator, - - /// The header of the minidump file. On serialization, the stream count - /// and rva will be updated to match the streams. On deserialization, - /// this is read directly from the file. - header: external.Header, - - /// The streams within the minidump file in the order they're serialized. 
- streams: std.ArrayListUnmanaged(Stream), - - /// Read the minidump file for the given source. - /// - /// The source must have a reader() and seekableStream() method. - /// For example, both File and std.io.FixedBufferStream implement these. - /// - /// The reader will read the full minidump data into memory. This makes - /// it easy to serialize the data back out. This is acceptable for our - /// use case which doesn't rely too much on being memory efficient or - /// high load. We also expect the minidump files to be relatively small - /// (dozens of MB at most, hundreds of KB typically). - /// - /// NOTE(mitchellh): If we ever want to make this more memory efficient, - /// I would create a new type that is a "lazy reader" that stores the - /// source type and reads the data as needed. Then this type should use - /// that type. - pub fn read(alloc_gpa: Allocator, source: anytype) !Minidump { - var arena = std.heap.ArenaAllocator.init(alloc_gpa); - errdefer arena.deinit(); - const alloc = arena.allocator(); - - // Read the header which also determines the endianness of the file. - const header, const endian = try readHeader(source); - //log.warn("header={} endian={}", .{ header, endian }); - - var streams = try std.ArrayListUnmanaged(Stream).initCapacity( - alloc, - header.stream_count, - ); - errdefer streams.deinit(alloc); - - // Read the streams. All the streams are first described in a - // "directory" structure which tells us the type of stream and - // where it is located in the file. The directory structures are - // stored in a contiguous block at the stream_directory_rva. - // - // Due to how we use this structure, we read directories one by one, - // then read all the data for that directory, then move on to the - // next directory. This is because we copy all the minidump data - // into memory. 
- const seeker = source.seekableStream(); - try seeker.seekTo(header.stream_directory_rva); - for (0..header.stream_count) |_| { - // Read the current directory - const directory = try source.reader().readStructEndian(external.Directory, endian); - log.warn("directory={}", .{directory}); - - // Seek to the location of the data. We have to store our current - // position because we need to seek back to it after reading the - // data in order to read the next directory. - const pos = try seeker.getPos(); - - try seeker.seekTo(directory.location.rva); - - // Read the data. The data length is defined by the directory. - // If we can't read exactly that amount of data, we return an error. - var data = std.ArrayList(u8).init(alloc); - defer data.deinit(); - source.reader().readAllArrayList( - &data, - directory.location.data_size, - ) catch |err| switch (err) { - // This means there was more data in the reader than what - // we asked for this. This is okay and expected because - // all streams except the last one will have this error. - error.StreamTooLong => {}, - else => return err, - }; - - // Basic check. - if (data.items.len != directory.location.data_size) return error.DataSizeMismatch; - - // Store our stream - try streams.append(alloc, .{ .encoded = .{ - .type = directory.stream_type, - .data = try data.toOwnedSlice(), - } }); - - // Seek back to where we were after reading this directory - // entry so we can read the next one. - try seeker.seekTo(pos); - } - - return .{ - .arena = arena, - .header = header, - .streams = streams, - }; - } - - /// Reads the header for the minidump file and returns endianness of - /// the file. - fn readHeader(source: anytype) !struct { external.Header, std.builtin.Endian } { - // Start by trying LE. - var endian: std.builtin.Endian = .little; - var header = try source.reader().readStructEndian(external.Header, endian); - - // If the signature doesn't match, we assume its BE. 
- if (header.signature != external.signature) { - // Seek back to the start of the file so we can reread. - try source.seekableStream().seekTo(0); - - // Try BE, if the signature doesn't match, return an error. - endian = .big; - header = try source.reader().readStructEndian(external.Header, endian); - if (header.signature != external.signature) return error.InvalidHeader; - } - - // "The low-order word is MINIDUMP_VERSION. The high-order word is an - // internal value that is implementation specific." - if (header.version.low != external.version) return error.InvalidVersion; - - return .{ header, endian }; - } - - pub fn deinit(self: *Minidump) void { - self.arena.deinit(); - } - - /// The arena allocator associated with this envelope - pub fn allocator(self: *Minidump) Allocator { - return self.arena.allocator(); - } -}; - -test "Minidump read" { - const testing = std.testing; - const alloc = testing.allocator; - var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp")); - var md = try Minidump.read(alloc, &fbs); - defer md.deinit(); -} diff --git a/src/crash/minidump/reader.zig b/src/crash/minidump/reader.zig new file mode 100644 index 000000000..0735de048 --- /dev/null +++ b/src/crash/minidump/reader.zig @@ -0,0 +1,167 @@ +const std = @import("std"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; +const external = @import("external.zig"); +const stream = @import("stream.zig"); +const EncodedStream = stream.EncodedStream; + +const log = std.log.scoped(.minidump_reader); + +/// Possible minidump-specific errors that can occur when reading a minidump. +/// This isn't the full error set since IO errors can also occur depending +/// on the Source type. +pub const ReadError = error{ + InvalidHeader, + InvalidVersion, +}; + +/// Reader creates a new minidump reader for the given source type. The +/// source must have both a "reader()" and "seekableStream()" function. 
+/// +/// Given the format of a minidump file, we must keep the source open and +/// continually access it because the format of the minidump is full of +/// pointers and offsets that we must follow depending on the stream types. +/// Also, since we're not aware of all stream types (in fact it's impossible +/// to be aware since custom stream types are allowed), it's possible any stream +/// type can define their own pointers and offsets. So, the source must always +/// be available so callers can decode the streams as needed. +pub fn Reader(comptime Source: type) type { + return struct { + const Self = @This(); + + /// The source data. + source: Source, + + /// The endianness of the minidump file. This is detected by reading + /// the byte order of the header. + endian: std.builtin.Endian, + + /// The number of streams within the minidump file. This is read from + /// the header and stored here so we can quickly access them. Note + /// the stream types require reading the source; this is an optimization + /// to avoid any allocations on the reader and the caller can choose + /// to store them if they want. + stream_count: u32, + stream_directory_rva: u32, + + const SourceCallable = switch (@typeInfo(Source)) { + .Pointer => |v| v.child, + .Struct => Source, + else => @compileError("Source type must be a pointer or struct"), + }; + + const SourceReader = @typeInfo(@TypeOf(SourceCallable.reader)).Fn.return_type.?; + const SourceSeeker = @typeInfo(@TypeOf(SourceCallable.seekableStream)).Fn.return_type.?; + + /// The reader type for stream reading. This is a LimitedReader so + /// you must still call reader() on the result to get the actual + /// reader to read the data. + pub const StreamReader = std.io.LimitedReader(SourceReader); + + /// Initialize a reader. The source must remain available for the entire + /// lifetime of the reader. 
The reader does not take ownership of the + /// source so if it has resources that need to be cleaned up, the caller + /// must do so once the reader is no longer needed. + pub fn init(source: Source) !Self { + const header, const endian = try readHeader(Source, source); + return .{ + .source = source, + .endian = endian, + .stream_count = header.stream_count, + .stream_directory_rva = header.stream_directory_rva, + }; + } + + /// Return a StreamReader for the given directory entry. This streams + /// from the underlying source so the returned reader is only valid + /// as long as the source is unmodified (i.e. the source is not + /// closed, the source is not seeked, etc.). + pub fn streamReader( + self: *const Self, + dir: external.Directory, + ) SourceSeeker.SeekError!StreamReader { + try self.source.seekableStream().seekTo(dir.location.rva); + return .{ + .inner_reader = self.source.reader(), + .bytes_left = dir.location.data_size, + }; + } + + /// Get the directory entry with the given index. + /// + /// Asserts the index is valid (idx < stream_count). + pub fn directory(self: *const Self, idx: usize) !external.Directory { + assert(idx < self.stream_count); + + // Seek to the directory. + const offset: u32 = @intCast(@sizeOf(external.Directory) * idx); + const rva: u32 = self.stream_directory_rva + offset; + try self.source.seekableStream().seekTo(rva); + + // Read the directory. + return try self.source.reader().readStructEndian( + external.Directory, + self.endian, + ); + } + }; +} + +/// Reads the header for the minidump file and returns endianness of +/// the file. +fn readHeader(comptime T: type, source: T) !struct { + external.Header, + std.builtin.Endian, +} { + // Start by trying LE. + var endian: std.builtin.Endian = .little; + var header = try source.reader().readStructEndian(external.Header, endian); + + // If the signature doesn't match, we assume it's BE. 
+ if (header.signature != external.signature) { + // Seek back to the start of the file so we can reread. + try source.seekableStream().seekTo(0); + + // Try BE; if the signature doesn't match, return an error. + endian = .big; + header = try source.reader().readStructEndian(external.Header, endian); + if (header.signature != external.signature) return ReadError.InvalidHeader; + } + + // "The low-order word is MINIDUMP_VERSION. The high-order word is an + // internal value that is implementation specific." + if (header.version.low != external.version) return ReadError.InvalidVersion; + + return .{ header, endian }; +} + +// Dumps the directory entries of the test minidump as debug information. +test "Minidump debug" { + var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp")); + const r = try Reader(*@TypeOf(fbs)).init(&fbs); + for (0..r.stream_count) |i| { + const dir = try r.directory(i); + log.warn("directory i={} dir={}", .{ i, dir }); + } +} + +test "Minidump read" { + const testing = std.testing; + const alloc = testing.allocator; + + var fbs = std.io.fixedBufferStream(@embedFile("../testdata/macos.dmp")); + const r = try Reader(*@TypeOf(fbs)).init(&fbs); + try testing.expectEqual(std.builtin.Endian.little, r.endian); + try testing.expectEqual(7, r.stream_count); + { + const dir = try r.directory(0); + try testing.expectEqual(3, dir.stream_type); + try testing.expectEqual(584, dir.location.data_size); + + var bytes = std.ArrayList(u8).init(alloc); + defer bytes.deinit(); + var sr = try r.streamReader(dir); + try sr.reader().readAllArrayList(&bytes, std.math.maxInt(usize)); + try testing.expectEqual(584, bytes.items.len); + } +}