diff --git a/src/renderer/Metal.zig b/src/renderer/Metal.zig
index f6cc0aebf..7c3f03aed 100644
--- a/src/renderer/Metal.zig
+++ b/src/renderer/Metal.zig
@@ -55,7 +55,9 @@ foreground: terminal.color.RGB,
 background: terminal.color.RGB,
 
 /// The current set of cells to render. This is rebuilt on every frame
-/// but we keep this around so that we don't reallocate.
+/// but we keep this around so that we don't reallocate. Each set of
+/// cells is drawn with its own draw call.
+cells_bg: std.ArrayListUnmanaged(GPUCell),
 cells: std.ArrayListUnmanaged(GPUCell),
 
 /// The current GPU uniform values.
@@ -69,6 +71,7 @@ font_shaper: font.Shaper,
 device: objc.Object, // MTLDevice
 queue: objc.Object, // MTLCommandQueue
 swapchain: objc.Object, // CAMetalLayer
+buf_cells_bg: objc.Object, // MTLBuffer
 buf_cells: objc.Object, // MTLBuffer
 buf_instance: objc.Object, // MTLBuffer
 pipeline: objc.Object, // MTLRenderPipelineState
@@ -204,6 +207,21 @@ pub fn init(alloc: Allocator, options: renderer.Options) !Metal {
         );
     };
 
+    const buf_cells_bg = buffer: {
+        // Preallocate for a 160x160 grid with one background cell per grid
+        // position. This should handle most terminals and avoid a resize later.
+        const prealloc = 160 * 160;
+
+        break :buffer device.msgSend(
+            objc.Object,
+            objc.sel("newBufferWithLength:options:"),
+            .{
+                @intCast(c_ulong, prealloc * @sizeOf(GPUCell)),
+                MTLResourceStorageModeShared,
+            },
+        );
+    };
+
     // Initialize our shader (MTLLibrary)
     const library = try initLibrary(device, @embedFile("shaders/cell.metal"));
     const pipeline_state = try initPipelineState(device, library);
@@ -222,6 +240,7 @@ pub fn init(alloc: Allocator, options: renderer.Options) !Metal {
         .cursor_style = .box,
 
         // Render state
+        .cells_bg = .{},
         .cells = .{},
         .uniforms = .{
             .projection_matrix = undefined,
@@ -241,6 +260,7 @@ pub fn init(alloc: Allocator, options: renderer.Options) !Metal {
         .queue = queue,
         .swapchain = swapchain,
         .buf_cells = buf_cells,
+        .buf_cells_bg = buf_cells_bg,
         .buf_instance = buf_instance,
         .pipeline = pipeline_state,
         .texture_greyscale = texture_greyscale,
@@ -250,6 +270,7 @@ pub fn init(alloc: Allocator, options: renderer.Options) !Metal {
 
 pub fn deinit(self: *Metal) void {
     self.cells.deinit(self.alloc);
+    self.cells_bg.deinit(self.alloc);
 
     self.font_shaper.deinit();
     self.alloc.free(self.font_shaper.cell_buf);
@@ -454,9 +475,6 @@ pub fn render(
     // Get our surface (CAMetalDrawable)
     const surface = self.swapchain.msgSend(objc.Object, objc.sel("nextDrawable"), .{});
 
-    // Setup our buffers
-    try self.syncCells();
-
     // If our font atlas changed, sync the texture data
     if (self.font_group.atlas_greyscale.modified) {
         try syncAtlasTexture(self.device, &self.font_group.atlas_greyscale, &self.texture_greyscale);
@@ -467,42 +485,42 @@ pub fn render(
         self.font_group.atlas_color.modified = false;
     }
 
-    // MTLRenderPassDescriptor
-    const desc = desc: {
-        const MTLRenderPassDescriptor = objc.Class.getClass("MTLRenderPassDescriptor").?;
-        const desc = MTLRenderPassDescriptor.msgSend(
-            objc.Object,
-            objc.sel("renderPassDescriptor"),
-            .{},
-        );
-
-        // Set our color attachment to be our drawable surface.
-        const attachments = objc.Object.fromId(desc.getProperty(?*anyopaque, "colorAttachments"));
-        {
-            const attachment = attachments.msgSend(
-                objc.Object,
-                objc.sel("objectAtIndexedSubscript:"),
-                .{@as(c_ulong, 0)},
-            );
-
-            attachment.setProperty("loadAction", @enumToInt(MTLLoadAction.clear));
-            attachment.setProperty("storeAction", @enumToInt(MTLStoreAction.store));
-            attachment.setProperty("texture", surface.getProperty(objc.c.id, "texture").?);
-            attachment.setProperty("clearColor", MTLClearColor{
-                .red = @intToFloat(f32, critical.bg.r) / 255,
-                .green = @intToFloat(f32, critical.bg.g) / 255,
-                .blue = @intToFloat(f32, critical.bg.b) / 255,
-                .alpha = 1.0,
-            });
-        }
-
-        break :desc desc;
-    };
-
     // Command buffer (MTLCommandBuffer)
     const buffer = self.queue.msgSend(objc.Object, objc.sel("commandBuffer"), .{});
 
     {
+        // MTLRenderPassDescriptor
+        const desc = desc: {
+            const MTLRenderPassDescriptor = objc.Class.getClass("MTLRenderPassDescriptor").?;
+            const desc = MTLRenderPassDescriptor.msgSend(
+                objc.Object,
+                objc.sel("renderPassDescriptor"),
+                .{},
+            );
+
+            // Set our color attachment to be our drawable surface.
+            const attachments = objc.Object.fromId(desc.getProperty(?*anyopaque, "colorAttachments"));
+            {
+                const attachment = attachments.msgSend(
+                    objc.Object,
+                    objc.sel("objectAtIndexedSubscript:"),
+                    .{@as(c_ulong, 0)},
+                );
+
+                attachment.setProperty("loadAction", @enumToInt(MTLLoadAction.clear));
+                attachment.setProperty("storeAction", @enumToInt(MTLStoreAction.store));
+                attachment.setProperty("texture", surface.getProperty(objc.c.id, "texture").?);
+                attachment.setProperty("clearColor", MTLClearColor{
+                    .red = @intToFloat(f32, critical.bg.r) / 255,
+                    .green = @intToFloat(f32, critical.bg.g) / 255,
+                    .blue = @intToFloat(f32, critical.bg.b) / 255,
+                    .alpha = 1.0,
+                });
+            }
+
+            break :desc desc;
+        };
+
         // MTLRenderCommandEncoder
         const encoder = buffer.msgSend(
             objc.Object,
@@ -518,11 +536,6 @@ pub fn render(
         encoder.msgSend(void, objc.sel("setRenderPipelineState:"), .{self.pipeline.value});
 
         // Set our buffers
-        encoder.msgSend(
-            void,
-            objc.sel("setVertexBuffer:offset:atIndex:"),
-            .{ self.buf_cells.value, @as(c_ulong, 0), @as(c_ulong, 0) },
-        );
         encoder.msgSend(
             void,
             objc.sel("setVertexBytes:length:atIndex:"),
@@ -549,18 +562,9 @@ pub fn render(
             },
         );
 
-        encoder.msgSend(
-            void,
-            objc.sel("drawIndexedPrimitives:indexCount:indexType:indexBuffer:indexBufferOffset:instanceCount:"),
-            .{
-                @enumToInt(MTLPrimitiveType.triangle),
-                @as(c_ulong, 6),
-                @enumToInt(MTLIndexType.uint16),
-                self.buf_instance.value,
-                @as(c_ulong, 0),
-                @as(c_ulong, self.cells.items.len),
-            },
-        );
+        // Issue the draw calls for this shader
+        try self.drawCells(encoder, &self.buf_cells_bg, self.cells_bg);
+        try self.drawCells(encoder, &self.buf_cells, self.cells);
 
         // Build our devmode draw data. This sucks because it requires we
         // lock our state mutex but the metal imgui implementation requires
@@ -588,6 +592,38 @@ pub fn render(
     buffer.msgSend(void, objc.sel("commit"), .{});
 }
 
+/// Uploads `cells` into `buf` and issues one instanced draw call for them;
+/// empty sets are skipped. Expects the encoder state to already be set up.
+/// Future: when we move to multiple shaders, this will go away and
+/// we'll have a draw call per-shader.
+fn drawCells(
+    self: *Metal,
+    encoder: objc.Object,
+    buf: *objc.Object,
+    cells: std.ArrayListUnmanaged(GPUCell),
+) !void {
+    // Metal API validation asserts on an instance count of 0; skip empty sets.
+    if (cells.items.len == 0) return;
+    try self.syncCells(buf, cells);
+    encoder.msgSend(
+        void,
+        objc.sel("setVertexBuffer:offset:atIndex:"),
+        .{ buf.value, @as(c_ulong, 0), @as(c_ulong, 0) },
+    );
+    encoder.msgSend(
+        void,
+        objc.sel("drawIndexedPrimitives:indexCount:indexType:indexBuffer:indexBufferOffset:instanceCount:"),
+        .{
+            @enumToInt(MTLPrimitiveType.triangle),
+            @as(c_ulong, 6),
+            @enumToInt(MTLIndexType.uint16),
+            self.buf_instance.value,
+            @as(c_ulong, 0),
+            @as(c_ulong, cells.items.len),
+        },
+    );
+}
+
 /// Resize the screen.
 pub fn setScreenSize(self: *Metal, _: renderer.ScreenSize) !void {
     // We use the bounds of our view which should be updated by now.
@@ -655,6 +691,10 @@ fn rebuildCells(
     screen: *terminal.Screen,
     draw_cursor: bool,
 ) !void {
+    // Bg cells at most will need space for the visible screen size
+    self.cells_bg.clearRetainingCapacity();
+    try self.cells_bg.ensureTotalCapacity(self.alloc, screen.rows * screen.cols);
+
     // Over-allocate just to ensure we don't allocate again during loops.
     self.cells.clearRetainingCapacity();
     try self.cells.ensureTotalCapacity(
@@ -662,7 +702,7 @@ fn rebuildCells(
 
-        // * 3 for background modes and cursor and underlines
+        // * 2 for non-background cells (backgrounds now live in cells_bg)
         // + 1 for cursor
-        (screen.rows * screen.cols * 3) + 1,
+        (screen.rows * screen.cols * 2) + 1,
     );
 
     // This is the cell that has [mode == .fg] and is underneath our cursor.
@@ -722,6 +762,12 @@ fn rebuildCells(
         cell.color = .{ 0, 0, 0, 255 };
         self.cells.appendAssumeCapacity(cell.*);
     }
+
+    // Some debug mode safety checks
+    if (std.debug.runtime_safety) {
+        for (self.cells_bg.items) |cell| assert(cell.mode == .bg);
+        for (self.cells.items) |cell| assert(cell.mode != .bg);
+    }
 }
 
 pub fn updateCell(
@@ -787,7 +833,7 @@ pub fn updateCell(
 
     // If the cell has a background, we always draw it.
     if (colors.bg) |rgb| {
-        self.cells.appendAssumeCapacity(.{
+        self.cells_bg.appendAssumeCapacity(.{
             .mode = .bg,
             .grid_pos = .{ @intToFloat(f32, x), @intToFloat(f32, y) },
             .cell_width = cell.widthLegacy(),
@@ -863,18 +909,22 @@ fn addCursor(self: *Metal, screen: *terminal.Screen) void {
 /// Sync the vertex buffer inputs to the GPU. This will attempt to reuse
 /// the existing buffer (of course!) but will allocate a new buffer if
 /// our cells don't fit in it.
-fn syncCells(self: *Metal) !void {
-    const req_bytes = self.cells.items.len * @sizeOf(GPUCell);
-    const avail_bytes = self.buf_cells.getProperty(c_ulong, "length");
+fn syncCells(
+    self: *Metal,
+    target: *objc.Object,
+    cells: std.ArrayListUnmanaged(GPUCell),
+) !void {
+    const req_bytes = cells.items.len * @sizeOf(GPUCell);
+    const avail_bytes = target.getProperty(c_ulong, "length");
 
     // If we need more bytes than our buffer has, we need to reallocate.
     if (req_bytes > avail_bytes) {
         // Deallocate previous buffer
-        deinitMTLResource(self.buf_cells);
+        deinitMTLResource(target.*);
 
         // Allocate a new buffer with enough to hold double what we require.
         const size = req_bytes * 2;
-        self.buf_cells = self.device.msgSend(
+        target.* = self.device.msgSend(
             objc.Object,
             objc.sel("newBufferWithLength:options:"),
             .{
@@ -885,12 +935,12 @@ fn syncCells(self: *Metal) !void {
     }
 
     // We can fit within the vertex buffer so we can just replace bytes.
-    const ptr = self.buf_cells.msgSend(?[*]u8, objc.sel("contents"), .{}) orelse {
+    const ptr = target.msgSend(?[*]u8, objc.sel("contents"), .{}) orelse {
         log.warn("buf_cells contents ptr is null", .{});
         return error.MetalFailed;
     };
 
-    @memcpy(ptr, @ptrCast([*]const u8, self.cells.items.ptr), req_bytes);
+    @memcpy(ptr, @ptrCast([*]const u8, cells.items.ptr), req_bytes);
 }
 
 /// Sync the atlas data to the given texture. This copies the bytes
diff --git a/src/renderer/OpenGL.zig b/src/renderer/OpenGL.zig
index ec368906a..d36670917 100644
--- a/src/renderer/OpenGL.zig
+++ b/src/renderer/OpenGL.zig
@@ -36,7 +36,9 @@ alloc: std.mem.Allocator,
 /// Current cell dimensions for this grid.
 cell_size: renderer.CellSize,
 
-/// The current set of cells to render.
+/// The current set of cells to render. Each set of cells goes into
+/// a separate shader call.
+cells_bg: std.ArrayListUnmanaged(GPUCell),
 cells: std.ArrayListUnmanaged(GPUCell),
 
 /// The LRU that stores our GPU cells cached by row IDs. This is used to
@@ -280,6 +282,7 @@ pub fn init(alloc: Allocator, options: renderer.Options) !OpenGL {
 
     return OpenGL{
         .alloc = alloc,
+        .cells_bg = .{},
         .cells = .{},
         .cells_lru = CellsLRU.init(0),
         .cell_size = .{ .width = metrics.cell_width, .height = metrics.cell_height },
@@ -316,6 +319,7 @@ pub fn deinit(self: *OpenGL) void {
     self.cells_lru.deinit(self.alloc);
 
     self.cells.deinit(self.alloc);
+    self.cells_bg.deinit(self.alloc);
     self.* = undefined;
 }
 
@@ -650,6 +654,10 @@ pub fn rebuildCells(
     const t = trace(@src());
     defer t.end();
 
+    // Bg cells at most will need space for the visible screen size
+    self.cells_bg.clearRetainingCapacity();
+    try self.cells_bg.ensureTotalCapacity(self.alloc, screen.rows * screen.cols);
+
     // For now, we just ensure that we have enough cells for all the lines
     // we have plus a full width. This is very likely too much but its
     // the probably close enough while guaranteeing no more allocations.
@@ -659,7 +667,7 @@ pub fn rebuildCells(
 
-        // * 3 for background modes and cursor and underlines
+        // * 2 for non-background cells (backgrounds now live in cells_bg)
         // + 1 for cursor
-        (screen.rows * screen.cols * 3) + 1,
+        (screen.rows * screen.cols * 2) + 1,
     );
 
     // We've written no data to the GPU, refresh it all
@@ -778,6 +786,12 @@ pub fn rebuildCells(
         cell.fg_a = 255;
         self.cells.appendAssumeCapacity(cell.*);
     }
+
+    // Some debug mode safety checks
+    if (std.debug.runtime_safety) {
+        for (self.cells_bg.items) |cell| assert(cell.mode == .bg);
+        for (self.cells.items) |cell| assert(cell.mode != .bg);
+    }
 }
 
 fn addCursor(self: *OpenGL, screen: *terminal.Screen) void {
@@ -886,7 +900,7 @@ pub fn updateCell(
     if (colors.bg) |rgb| {
         var mode: GPUCellMode = .bg;
 
-        self.cells.appendAssumeCapacity(.{
+        self.cells_bg.appendAssumeCapacity(.{
             .mode = mode,
             .grid_col = @intCast(u16, x),
             .grid_row = @intCast(u16, y),
@@ -1145,9 +1159,6 @@ pub fn draw(self: *OpenGL) !void {
     // If we have no cells to render, then we render nothing.
     if (self.cells.items.len == 0) return;
 
-    const pbind = try self.program.use();
-    defer pbind.unbind();
-
     // Setup our VAO
     try self.vao.bind();
     defer gl.VertexArray.unbind() catch null;
@@ -1160,34 +1171,6 @@ pub fn draw(self: *OpenGL) !void {
     var binding = try self.vbo.bind(.ArrayBuffer);
     defer binding.unbind();
 
-    // Our allocated buffer on the GPU is smaller than our capacity.
-    // We reallocate a new buffer with the full new capacity.
-    if (self.gl_cells_size < self.cells.capacity) {
-        log.info("reallocating GPU buffer old={} new={}", .{
-            self.gl_cells_size,
-            self.cells.capacity,
-        });
-
-        try binding.setDataNullManual(
-            @sizeOf(GPUCell) * self.cells.capacity,
-            .StaticDraw,
-        );
-
-        self.gl_cells_size = self.cells.capacity;
-        self.gl_cells_written = 0;
-    }
-
-    // If we have data to write to the GPU, send it.
-    if (self.gl_cells_written < self.cells.items.len) {
-        const data = self.cells.items[self.gl_cells_written..];
-        //log.info("sending {} cells to GPU", .{data.len});
-        try binding.setSubData(self.gl_cells_written * @sizeOf(GPUCell), data);
-
-        self.gl_cells_written += data.len;
-        assert(data.len > 0);
-        assert(self.gl_cells_written <= self.cells.items.len);
-    }
-
     // Bind our textures
     try gl.Texture.active(gl.c.GL_TEXTURE0);
     var texbind = try self.texture.bind(.@"2D");
@@ -1197,10 +1180,59 @@ pub fn draw(self: *OpenGL) !void {
     var texbind1 = try self.texture_color.bind(.@"2D");
     defer texbind1.unbind();
 
+    // Pick our shader to use
+    const pbind = try self.program.use();
+    defer pbind.unbind();
+
+    try self.drawCells(binding, self.cells_bg);
+    try self.drawCells(binding, self.cells);
+}
+
+/// Uploads `cells` through `binding` (reallocating the GPU buffer if it is
+/// smaller than the list capacity) and issues one instanced draw call.
+/// This expects all the OpenGL state to be setup.
+/// Future: when we move to multiple shaders, this will go away and
+/// we'll have a draw call per-shader.
+fn drawCells(
+    self: *OpenGL,
+    binding: gl.Buffer.Binding,
+    cells: std.ArrayListUnmanaged(GPUCell),
+) !void {
+    // TODO: get rid of this completely; every call re-uploads from index 0.
+    self.gl_cells_written = 0;
+
+    // Our allocated buffer on the GPU is smaller than this list's capacity.
+    // We reallocate a new buffer with the full new capacity.
+    if (self.gl_cells_size < cells.capacity) {
+        log.info("reallocating GPU buffer old={} new={}", .{
+            self.gl_cells_size,
+            cells.capacity,
+        });
+
+        try binding.setDataNullManual(
+            @sizeOf(GPUCell) * cells.capacity,
+            .StaticDraw,
+        );
+
+        self.gl_cells_size = cells.capacity;
+        self.gl_cells_written = 0;
+    }
+
+    // If we have data to write to the GPU, send it.
+    if (self.gl_cells_written < cells.items.len) {
+        const data = cells.items[self.gl_cells_written..];
+        //log.info("sending {} cells to GPU", .{data.len});
+        try binding.setSubData(self.gl_cells_written * @sizeOf(GPUCell), data);
+
+        self.gl_cells_written += data.len;
+        assert(data.len > 0);
+        assert(self.gl_cells_written <= cells.items.len);
+    }
+
     try gl.drawElementsInstanced(
         gl.c.GL_TRIANGLES,
         6,
         gl.c.GL_UNSIGNED_BYTE,
-        self.cells.items.len,
+        cells.items.len,
     );
 }