ghostty/vendor/pixman/pixman/pixman-arma64-neon-asm.h

/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions, based on a common template.
 * Any combinations of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp, 32bpp color formats are supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encourages the use of software pipelining for better instructions
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
 * pixels processing. See 'pixman-arma64-neon-asm.S' file for the usage
 * examples.
 *
 * TODO:
 *  - try overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune generated functions behavior.
 *
 * FLAG_DST_WRITEONLY and FLAG_DST_READWRITE are the two states of bit 0:
 * when the bit is set, generate_composite_function makes dst_r_bpp equal
 * to dst_w_bpp, otherwise dst_r_bpp is 0 (no destination loads are
 * generated). FLAG_DEINTERLEAVE_32BPP (bit 1) sets
 * DEINTERLEAVE_32BPP_ENABLED to 1, which is tested by the pixld, pixst,
 * pixdeinterleave and pixinterleave macros.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

/*
 * Constants for selecting preferable prefetch type.
 *
 * generate_composite_function starts from PREFETCH_TYPE_DEFAULT and stores
 * the effective choice in PREFETCH_TYPE_CURRENT: NONE when the requested
 * prefetch distance is 0, and SIMPLE instead of ADVANCED when any of the
 * source, mask or destination formats is 24bpp.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*
 * prefetch mode: the prefetch operation name given to every 'prfm'
 * instruction emitted by the cache preload code in this file.
 * available modes are:
 * pldl1keep
 * pldl1strm
 * pldl2keep
 * pldl2strm
 * pldl3keep
 * pldl3strm
 */
#define PREFETCH_MODE pldl1keep

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

/*
 * One-register transfer: emits 'op' on {v<reg1>.<elem_size>} at
 * [mem_operand] and post-increments mem_operand by 8 bytes.
 * 'abits' is accepted for call-site symmetry but is not used.
 */
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
    op {v&reg1&.&elem_size}, [&mem_operand&], #8
.endm

/*
 * Two-register transfer: emits 'op' on {v<reg1>, v<reg2>} (both with the
 * <elem_size> arrangement) and post-increments mem_operand by 16 bytes.
 * 'abits' is not used.
 */
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
.endm

/*
 * Four-register transfer: emits 'op' on {v<reg1> .. v<reg4>} (all with the
 * <elem_size> arrangement) and post-increments mem_operand by 32 bytes.
 * 'abits' is not used.
 */
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
.endm

/*
 * Single-lane transfer: emits 'op' on lane 'idx' of v<reg1>.<elem_size>
 * and post-increments mem_operand by 'bytes'. The caller is responsible
 * for passing a 'bytes' value that matches the lane width.
 * 'abits' is not used.
 */
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
.endm

/*
 * Three-register transfer (used with ld3/st3 for 24bpp data): emits 'op'
 * on {v<reg1>, v<reg2>, v<reg3>} and post-increments mem_operand by
 * 24 bytes.
 */
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
.endm

/*
 * Single-lane three-register transfer (one 24bpp pixel with ld3/st3):
 * emits 'op' on lane 'idx' of {v<reg1>, v<reg2>, v<reg3>} and
 * post-increments mem_operand by 3 bytes.
 */
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
.endm

/*
 * Emit a plain (non-interleaving) load or store of 'numbytes' bytes.
 *
 * numbytes    - transfer size in bytes: 32, 16, 8, 4, 2 or 1; any other
 *               value is an assembly-time error
 * op          - instruction mnemonic to emit (pixld/pixst pass ld1/st1)
 * elem_size   - element width in bits: 32, 16, anything else is handled
 *               as 8
 * basereg     - number of the first register of the register group
 * mem_operand - address register; post-incremented by the helper macros
 * abits       - alignment hint, forwarded to the helpers (which ignore it)
 *
 * Register/lane layout:
 *   32 bytes -> v<basereg+4> .. v<basereg+7>
 *   16 bytes -> v<basereg+2>, v<basereg+3>
 *    8 bytes -> v<basereg+1>
 *    4 bytes -> v<basereg+0> lane s[1]  (or h[2..3], or b[4..7])
 *    2 bytes -> v<basereg+0> lane h[1]  (or b[2..3])
 *    1 byte  -> v<basereg+0> lane b[1]
 */
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    .if elem_size==32
        pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
    .elseif elem_size==16
        pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
    .else
        pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
    .endif
.elseif numbytes == 16
    .if elem_size==32
          pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
    .elseif elem_size==16
          pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
    .else
          pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
    .endif
.elseif numbytes == 8
    .if elem_size==32
        pixldst1 op, 2s, %(basereg+1), mem_operand, abits
    .elseif elem_size==16
        pixldst1 op, 4h, %(basereg+1), mem_operand, abits
    .else
        pixldst1 op, 8b, %(basereg+1), mem_operand, abits
    .endif
.elseif numbytes == 4
    /*
     * One 32-bit lane access, unless RESPECT_STRICT_ALIGNMENT is set and
     * the elements are narrower; then one access per element is emitted.
     */
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
    .elseif elem_size == 16
        pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
        pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
    .else
        pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
        pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
        pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
        pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
    .endif
.elseif numbytes == 2
    /* same rule as above, with a 16-bit lane as the wide access */
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
    .else
        pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
        pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
    .endif
.elseif numbytes == 1
        pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
.else
    .error "unsupported size: numbytes"
.endif
.endm

/*
 * Load 'numpix' pixels of 'bpp' bits each from [mem_operand] (which is
 * post-incremented) into the register group starting at 'basereg'.
 * Expands to nothing when bpp is 0.
 *
 *  - 8 pixels of 32bpp with DEINTERLEAVE_32BPP_ENABLED: one ld4 into
 *    v<basereg+4> .. v<basereg+7>
 *  - 24bpp: ld3 into v<basereg+3..5> for 8 pixels, otherwise per-pixel ld3
 *    into lanes of v<basereg+0..2> (lanes 4-7 for 4 pixels, 2-3 for
 *    2 pixels, 1 for a single pixel)
 *  - everything else: pixldst with ld1 and numpix * bpp / 8 bytes
 *
 * abits is an alignment hint (default 0) that is only passed through.
 */
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

/*
 * Store 'numpix' pixels of 'bpp' bits each from the register group
 * starting at 'basereg' to [mem_operand] (which is post-incremented).
 * Expands to nothing when bpp is 0.
 *
 * Mirrors pixld (st4 / st3 / st1 instead of ld4 / ld3 / ld1) with two
 * extra cases: when the whole store is 32 (or 16) bits and abits says the
 * address is aligned to that size, it is emitted as a single 32-bit
 * (or 16-bit) element store regardless of bpp.
 */
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.elseif numpix * bpp == 32 && abits == 32
    pixldst 4, st1, 32, basereg, mem_operand, abits
.elseif numpix * bpp == 16 && abits == 16
    pixldst 2, st1, 16, basereg, mem_operand, abits
.else
    pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

/*
 * "Aligned" flavour of pixld: same load, with the alignment hint set to
 * the size of the transfer in bits, capped at 128.
 */
.macro pixld_a numpix, bpp, basereg, mem_operand
.if (numpix * bpp) > 128
    pixld numpix, bpp, basereg, mem_operand, 128
.else
    pixld numpix, bpp, basereg, mem_operand, %(numpix * bpp)
.endif
.endm

/*
 * "Aligned" flavour of pixst: same store, with the alignment hint set to
 * the size of the transfer in bits, capped at 128.
 */
.macro pixst_a numpix, bpp, basereg, mem_operand
.if (numpix * bpp) > 128
    pixst numpix, bpp, basereg, mem_operand, 128
.else
    pixst numpix, bpp, basereg, mem_operand, %(numpix * bpp)
.endif
.endm

/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
 * aliases and SRC_WIDTH_FIXED to be defined)
 *
 * pixld1_s fills one 64-bit register: four 16bpp pixels go to lanes
 * h[0..3] of v<reg1>, or two 32bpp pixels go to lanes s[0..1]. For each
 * pixel:
 *   - the pixel index is VX >> 16 (arithmetic shift)
 *   - VX is advanced by UNIT_X; if the result is not negative,
 *     SRC_WIDTH_FIXED is subtracted from it until it becomes negative
 *   - the pixel is read from mem_operand + index * bytes-per-pixel
 * mem_operand itself is not modified. In the 16bpp case the address
 * computation for the next pixel is interleaved with the loads of the
 * previous ones.
 *
 * Clobbers TMP1, TMP2 and the condition flags; uses local labels 5 and 55.
 * Any elem_size other than 16 or 32 is an assembly-time error.
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    asr     TMP1, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP1, mem_operand, TMP1, lsl #1
    asr     TMP2, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP2, mem_operand, TMP2, lsl #1
    ld1     {v&reg1&.h}[0], [TMP1]
    asr     TMP1, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP1, mem_operand, TMP1, lsl #1
    ld1     {v&reg1&.h}[1], [TMP2]
    asr     TMP2, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP2, mem_operand, TMP2, lsl #1
    ld1     {v&reg1&.h}[2], [TMP1]
    ld1     {v&reg1&.h}[3], [TMP2]
.elseif elem_size == 32
    asr     TMP1, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP1, mem_operand, TMP1, lsl #2
    asr     TMP2, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP2, mem_operand, TMP2, lsl #2
    ld1     {v&reg1&.s}[0], [TMP1]
    ld1     {v&reg1&.s}[1], [TMP2]
.else
    .error "unsupported"
.endif
.endm

/*
 * Nearest-scaling fetch into two registers: fills v<reg1> and then
 * v<reg2> by expanding pixld1_s once for each, so the pixels end up in
 * source order across the pair. Requirements, clobbers and supported
 * element sizes are those of pixld1_s.
 */
.macro pixld2_s elem_size, reg1, reg2, mem_operand
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endm

/*
 * Nearest-scaling fetch of a single pixel into lane 'idx' of v<reg1>.
 *
 * The pixel index is VX >> 16 (arithmetic shift). VX is then advanced by
 * UNIT_X and, if the result is not negative, SRC_WIDTH_FIXED is subtracted
 * until it becomes negative. The pixel is read from
 * mem_operand + index * bytes-per-pixel; mem_operand is not modified.
 *
 * elem_size - 16 or 32; any other value is an assembly-time error, as in
 *             pixld1_s, instead of silently loading nothing
 *
 * Clobbers TMP1, the condition flags and (32-bit case only) DUMMY; uses
 * local labels 5 and 55.
 */
.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    asr     TMP1, VX, #16
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP1, mem_operand, TMP1, lsl #1
    ld1     {v&reg1&.h}[idx], [TMP1]
.elseif elem_size == 32
    /* the index is staged through DUMMY before landing in TMP1 */
    asr     DUMMY, VX, #16
    mov     TMP1, DUMMY
    adds    VX, VX, UNIT_X
    bmi     55f
5:  subs    VX, VX, SRC_WIDTH_FIXED
    bpl     5b
55:
    add     TMP1, mem_operand, TMP1, lsl #2
    ld1     {v&reg1&.s}[idx], [TMP1]
.else
    .error "unsupported"
.endif
.endm

/*
 * Nearest-scaling counterpart of pixldst: fetch 'numbytes' bytes worth of
 * scaled pixels into the register group starting at 'basereg', using the
 * same register/lane layout as pixldst:
 *   32 bytes -> v<basereg+4..7>, followed by pixdeinterleave
 *   16 bytes -> v<basereg+2..3>
 *    8 bytes -> v<basereg+1>
 *    4, 2, 1 -> lanes of v<basereg+0>
 * Any other numbytes value is an assembly-time error.
 *
 * NOTE(review): the fallback branches taken when elem_size is neither 32
 * nor 16 expand pixld0_s with an element size for which pixld0_s has no
 * load sequence.
 */
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
    .if elem_size == 32
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .elseif elem_size == 16
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 4, mem_operand
        pixld0_s elem_size, %(basereg+0), 5, mem_operand
        pixld0_s elem_size, %(basereg+0), 6, mem_operand
        pixld0_s elem_size, %(basereg+0), 7, mem_operand
    .endif
.elseif numbytes == 2
    .if elem_size == 16
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm

/*
 * Nearest-scaling replacement for pixld: fetch 'numpix' scaled pixels of
 * 'bpp' bits each into the register group starting at 'basereg'.
 * Expands to nothing when bpp is 0; otherwise converts the pixel count to
 * a byte count and defers to pixld_s_internal.
 */
.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm

/*
 * In-place 8-byte unzip of the pair v<reg1>, v<reg2>: afterwards v<reg1>
 * holds the uzp1 result and v<reg2> the uzp2 result of the original pair.
 *
 * v16 is borrowed as a temporary because uzp2 still needs the original
 * v<reg1> after uzp1 has produced its result. Its d[0] half is parked in
 * DUMMY and written back at the end, so DUMMY is clobbered.
 * NOTE(review): only d[0] of v16 is saved and restored - confirm that no
 * caller keeps live data in the upper half of v16.
 */
.macro vuzp8 reg1, reg2
    umov DUMMY, v16.d[0]
    uzp1 v16.8b,     v&reg1&.8b, v&reg2&.8b
    uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
    mov  v&reg1&.8b, v16.8b
    mov  v16.d[0], DUMMY
.endm

/*
 * In-place 8-byte zip of the pair v<reg1>, v<reg2>: afterwards v<reg1>
 * holds the zip1 result and v<reg2> the zip2 result of the original pair.
 *
 * v16 is borrowed as a temporary because zip2 still needs the original
 * v<reg1> after zip1 has produced its result. Its d[0] half is parked in
 * DUMMY and written back at the end, so DUMMY is clobbered.
 * NOTE(review): only d[0] of v16 is saved and restored - confirm that no
 * caller keeps live data in the upper half of v16.
 */
.macro vzip8 reg1, reg2
    umov DUMMY, v16.d[0]
    zip1 v16.8b,     v&reg1&.8b, v&reg2&.8b
    zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
    mov  v&reg1&.8b, v16.8b
    mov  v16.d[0], DUMMY
.endm

/*
 * Split the B, G, R, A channels of eight 32bpp pixels held in
 * v<basereg> .. v<basereg+3> into one register per channel.
 * Expands to nothing unless deinterleaving is enabled and bpp is 32.
 */
.macro pixdeinterleave bpp, basereg
.if DEINTERLEAVE_32BPP_ENABLED != 0
.if bpp == 32
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endif
.endm

/*
 * Merge the per-channel B, G, R, A registers v<basereg> .. v<basereg+3>
 * back into eight packed 32bpp pixels (the inverse of pixdeinterleave).
 * Expands to nothing unless deinterleaving is enabled and bpp is 32.
 */
.macro pixinterleave bpp, basereg
.if DEINTERLEAVE_32BPP_ENABLED != 0
.if bpp == 32
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * cache preload logic is mostly independent from the rest of pixels
 * processing code. It starts at the top left pixel and moves forward
 * across pixels and can jump across scanlines. Prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching optimal prefetch distance,
 * it is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (but it still can prefetch a bit
 * more data on the right side of the image - not a big issue and may
 * be actually helpful when rendering text glyphs). Additional trick is
 * the use of LDR instruction for prefetch instead of PLD when moving to
 * the next line, the point is that we have a high chance of getting TLB
 * miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working with
 * fully cached data). But in reality, due to having a separate pipeline and
 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
 * execute simultaneously with NEON and be completely shadowed by it. Thus
 * we get no performance overhead at all (*). This looks like a very nice
 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
 * but still can implement some rather advanced prefetch logic in software
 * for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 * pixels processing like simple copy. Anyway, having prefetch is a must
 * when working with the graphics data.
 */
/*
 * Conditional instruction wrapper for the advanced prefetcher: 'PF insn
 * operands' emits 'insn operands' only when PREFETCH_TYPE_CURRENT is
 * PREFETCH_TYPE_ADVANCED, and nothing otherwise.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

/*
 * One step of the advanced prefetcher. Every instruction is wrapped in
 * PF, so only the bare labels are emitted unless PREFETCH_TYPE_CURRENT is
 * PREFETCH_TYPE_ADVANCED. Expands to nothing at all when neither source,
 * destination reads nor mask are in use.
 *
 * std_increment   - added to PF_X on every step (omitted when 0)
 * boost_increment - additionally added to PF_X while the low 4 bits of
 *                   PF_CTL are non-zero; each such step decrements PF_CTL
 *
 * Uses local labels 71 and 72; clobbers DUMMY and the condition flags.
 */
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF beq 71f
    PF add PF_X, PF_X, #boost_increment
    PF sub PF_CTL, PF_CTL, #1
71:
    /*
     * The result of this compare is consumed by the 'ble' after the
     * prefetch hints; the lsl/prfm pairs in between do not set flags.
     */
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF lsl DUMMY, PF_X, #src_bpp_shift
    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
.endif
.if dst_r_bpp != 0
    PF lsl DUMMY, PF_X, #dst_bpp_shift
    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
.endif
.if mask_bpp_shift >= 0
    PF lsl DUMMY, PF_X, #mask_bpp_shift
    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
.endif
    PF ble 71f
    /*
     * PF_X went past ORIG_W: wrap it around and take one unit off the
     * counter kept in the bits of PF_CTL above the low four.
     */
    PF sub PF_X, PF_X, ORIG_W
    PF subs PF_CTL, PF_CTL, #0x10
71:
    /*
     * Reached with the flags of either the compare (no wrap) or the subs
     * (wrap); in both cases "less or equal" skips the loads below.
     */
    PF ble 72f
    /*
     * Real byte loads (not hints) from one stride beyond each prefetch
     * pointer.
     * NOTE(review): each pointer is then advanced by 1, not by a stride,
     * although the description of this macro talks about moving to the
     * next line - confirm this is intended.
     */
.if src_bpp_shift >= 0
    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
    PF ldrsb DUMMY, [PF_SRC, DUMMY]
    PF add PF_SRC, PF_SRC, #1
.endif
.if dst_r_bpp != 0
    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
    PF ldrsb DUMMY, [PF_DST, DUMMY]
    PF add PF_DST, PF_DST, #1
.endif
.if mask_bpp_shift >= 0
    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
    PF ldrsb DUMMY, [PF_MASK, DUMMY]
    PF add PF_MASK, PF_MASK, #1
.endif
72:
.endif
.endm

/*
 * Simple prefetcher: when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_SIMPLE,
 * emit one prfm hint PREFETCH_DISTANCE_SIMPLE pixels ahead of the current
 * SRC, DST_R and MASK pointers, for each of those images whose bit depth
 * is non-zero. Expands to nothing for any other prefetch type.
 */
.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/*
 * Load one full block of pixblock_size mask pixels from MASK.
 * The base register passed to pixld is lowered by
 * pixblock_size * mask_bpp / 64, which for 8-, 16-, 24- and 32-byte blocks
 * equals the register offset pixld applies for a transfer of that size,
 * so the data lands in v<mask_basereg> onwards.
 */
.macro fetch_mask_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm

/*
 * Macro which is used to process leading pixels until destination
 * pointer is properly aligned (at 16 bytes boundary). When destination
 * buffer uses 24bpp format, nothing is generated except the exit label.
 *
 * The work is split in two passes over the chunk sizes 1, 2, 4, 8 and 16
 * bytes; a chunk size takes part only if it holds at least one destination
 * pixel and is smaller than a full pixblock:
 *   1. load pass  - for every bit that is set in DST_R, load that many
 *      bytes worth of source, mask and destination pixels and take them
 *      off W (the 16-byte chunk is processed without testing a bit)
 *   2. a single process_pixblock_head / process_pixblock_tail pair
 *   3. store pass - test the same bits in DST_W and store the results
 * When there is nothing to load at all (source, mask and destination
 * reads all disabled) the load pass is omitted and W is adjusted in the
 * store pass instead.
 *
 * process_pixblock_tail_head is accepted but not used here.
 * Uses local labels 51 and 52.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    /* already aligned to 16 bytes: skip everything */
    tst         DST_R, #0xF
    beq         52f

.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         51f
.endif
    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    /* write-only destination: just keep DST_R moving with the chunks */
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
51:
.endif
.endr
.endif
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg

.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         51f
.endif
.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
    /* the load pass was not generated, so account for the pixels here */
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
51:
.endif
.endr
.endif
52:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operation on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally are unused), then perform compositing
 * operation as usual. In the end, the data is taken from these 'slots'
 * and saved to memory.
 *
 * The remainder is decomposed by testing the bits of W for the chunk
 * sizes 16, 8, 4, 2 and 1 (only those below pixblock_size), once for
 * loading and once more for storing; W itself is not modified.
 *
 * cache_preload_flag - allows to suppress prefetch if
 *                      set to 0
 * dst_aligned_flag   - selects whether destination buffer
 *                      is aligned (pixld_a/pixst_a are used for the
 *                      destination when non-zero, pixld/pixst otherwise)
 *
 * process_pixblock_tail_head is accepted but not used here.
 * Uses local labels 51 and 52.
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    /* no remainder below a full pixblock: nothing to do */
    tst         W, #(pixblock_size - 1)
    beq         52f
.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         51f
    pixld_src   chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
51:
.endif
.endr
.endif
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         51f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
51:
.endif
.endr
52:
.endm

/*
 * Scanline epilogue: reload the width counter, step every image pointer
 * to the start of the next scanline and branch back to
 * start_of_loop_label unless the line counter H has run out.
 *
 * Each pointer moves forward by one stride and back by the ORIG_W pixels
 * just processed. Pointers of 24bpp images are not moved back here, and
 * the source / mask pointers are left alone when their bit depth is 0.
 */
.macro advance_to_next_scanline start_of_loop_label
    mov         W, ORIG_W

    /* destination (write pointer) */
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if dst_w_bpp != 24
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif

    /* source */
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.if src_bpp != 24
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.endif

    /* mask */
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.if mask_bpp != 24
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
.endif

    /* the read pointer restarts from the new write position */
    mov         DST_R, DST_W
    subs        H, H, #1
    bge         start_of_loop_label
.endm

/*
 * Generate a complete compositing function called 'fname'.
 *
 * Parameters:
 *  fname              - name of the generated function
 *  src_bpp_           - source bit depth: 0, 8, 16, 24 or 32
 *  mask_bpp_          - mask bit depth: 0, 8, 24 or 32
 *  dst_w_bpp_         - destination bit depth: 8, 16, 24 or 32
 *  flags              - combination of the FLAG_* bits
 *  pixblock_size_     - number of pixels handled by one head/tail pair
 *  prefetch_distance  - 0..15; 0 disables prefetch for this function
 *  init, cleanup      - macros expanded once after the common setup and
 *                       once before the registers are restored
 *  process_pixblock_head, process_pixblock_tail,
 *  process_pixblock_tail_head
 *                     - macros implementing the pixel arithmetic
 *  *_basereg_         - first NEON register of each register group
 *
 * Unsupported bit depths and out-of-range prefetch distances are
 * assembly-time errors.
 *
 * Registers are allocated in the following way by default:
 * v0, v1, v2, v3     - reserved for loading source pixel data
 * v4, v5, v6, v7     - reserved for loading destination pixel data
 * v24, v25, v26, v27 - reserved for loading mask pixel data
 * v28, v29, v30, v31 - final destination pixel data for writeback to memory
 *
 * Local labels defined here: 0 (wide scanline loop), 100 and 200 (pipelined
 * inner loop and its exit), 800 (narrow scanline loop), 100 again inside
 * the narrow loop, 1000 and 9 (the two exit sequences).
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

    pixman_asm_function fname
    /*
     * Prologue: x29/x30 are pushed, then the low halves of v8-v15 and
     * x8-x28 are stored in the 232 bytes below the frame record. All
     * accesses are x29-relative; the two post-incrementing st1 bring x29
     * back to the frame record address.
     * NOTE(review): sp ends up 232 bytes below the frame record, which is
     * not a multiple of 16. Nothing in this macro addresses memory through
     * sp, but confirm that init, cleanup and the pixblock macros do not
     * either.
     */
    stp         x29, x30, [sp, -16]!
    mov         x29, sp
    sub         sp,   sp, 232  /* push all registers */
    sub         x29, x29, 64
    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
    stp          x8,   x9, [x29, -80]
    stp         x10,  x11, [x29, -96]
    stp         x12,  x13, [x29, -112]
    stp         x14,  x15, [x29, -128]
    stp         x16,  x17, [x29, -144]
    stp         x18,  x19, [x29, -160]
    stp         x20,  x21, [x29, -176]
    stp         x22,  x23, [x29, -192]
    stp         x24,  x25, [x29, -208]
    stp         x26,  x27, [x29, -224]
    str         x28, [x29, -232]

/*
 * Select prefetch type for this function. If prefetch distance is
 * set to 0, prefetch is disabled (NONE). Otherwise, if one of the color
 * formats is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

/*
 * Source loads go through pixld_src so that the helper macros stay
 * independent of how the source is fetched; here it is plain pixld.
 * Both temporary macros are purged again at the end of this macro.
 */
    .macro pixld_src x:vararg
        pixld x
    .endm
    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm
/*
 * Assign symbolic names to registers
 */
    W           .req       x0      /* width (is updated during processing) */
    H           .req       x1      /* height (is updated during processing) */
    DST_W       .req       x2      /* destination buffer pointer for writes */
    DST_STRIDE  .req       x3      /* destination image stride */
    SRC         .req       x4      /* source buffer pointer */
    SRC_STRIDE  .req       x5      /* source image stride */
    MASK        .req       x6      /* mask pointer */
    MASK_STRIDE .req       x7      /* mask stride */

    DST_R       .req       x8      /* destination buffer pointer for reads */

    PF_CTL      .req       x9      /* combined lines counter and prefetch */
                                    /* distance increment counter */
    PF_X        .req       x10     /* pixel index in a scanline for current */
                                    /* prefetch position */
    PF_SRC      .req       x11     /* pointer to source scanline start */
                                    /* for prefetch purposes */
    PF_DST      .req       x12     /* pointer to destination scanline start */
                                    /* for prefetch purposes */
    PF_MASK     .req       x13     /* pointer to mask scanline start */
                                    /* for prefetch purposes */

    ORIG_W      .req       x14     /* saved original width */
    DUMMY       .req       x15     /* temporary register */

    /* width, height and the three strides arrive as 32-bit signed values */
    sxtw        x0, w0
    sxtw        x1, w1
    sxtw        x3, w3
    sxtw        x5, w5
    sxtw        x7, w7

/*
 * Translate bit depths into shift amounts (log2 of bytes per pixel).
 * 24bpp gets shift 0 and a shift of -1 marks an image that is not used.
 */
    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

/* decode the flags: destination reads and 32bpp channel deinterleaving */
.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

    PF mov      PF_X, #0
    mov         DST_R, DST_W

/*
 * For 24bpp images 3 * W is taken off the stride up front;
 * advance_to_next_scanline does not rewind 24bpp pointers in return.
 */
.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF lsl      DUMMY, H, #4
    PF mov      PF_CTL, DUMMY
    PF add      PF_CTL, PF_CTL, #(prefetch_distance - 0x10)

    init
    /* H now counts the scanlines still to do after the current one */
    subs        H, H, #1
    mov         ORIG_W, W
    blt         9f
    /* narrow images take the simpler loop at label 800 */
    cmp         W, #(pixblock_size * 2)
    blt         800f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         200f

100:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         100b

200:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

    cleanup
1000:
    /* pop all registers */
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp          x8,   x9, [x29, -80]
    ldp         x10,  x11, [x29, -96]
    ldp         x12,  x13, [x29, -112]
    ldp         x14,  x15, [x29, -128]
    ldp         x16,  x17, [x29, -144]
    ldp         x18,  x19, [x29, -160]
    ldp         x20,  x21, [x29, -176]
    ldp         x22,  x23, [x29, -192]
    ldp         x24,  x25, [x29, -208]
    ldp         x26,  x27, [x29, -224]
    ldr         x28, [x29, -232]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch are used.
 */
800:
    /* the only hints issued here: one per image, a stride ahead */
.if src_bpp_shift >= 0
    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
    PF prfm PREFETCH_MODE, [SRC, DUMMY]
.endif
.if dst_r_bpp != 0
    PF lsl  DUMMY, DST_STRIDE, #dst_bpp_shift
    PF prfm PREFETCH_MODE, [DST_R, DUMMY]
.endif
.if mask_bpp_shift >= 0
    PF lsl  DUMMY, MASK_STRIDE, #mask_bpp_shift
    PF prfm PREFETCH_MODE, [MASK, DUMMY]
.endif
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         100f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
100:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 800b
9:
    /* also reached directly from the entry code when there are no lines */
    cleanup
    /* pop all registers */
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp          x8,   x9, [x29, -80]
    ldp         x10,  x11, [x29, -96]
    ldp         x12,  x13, [x29, -112]
    ldp         x14,  x15, [x29, -128]
    ldp         x16,  x17, [x29, -144]
    ldp         x18,  x19, [x29, -160]
    ldp         x20,  x21, [x29, -176]
    ldp         x22,  x23, [x29, -192]
    ldp         x24,  x25, [x29, -208]
    ldp         x26,  x27, [x29, -224]
    ldr         x28, [x29, -232]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */

    /* drop the per-function helper macros and register aliases */
    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of function generation template for a single
 * scanline processing (for implementing pixman combine functions)
 */
.macro generate_composite_function_scanline        use_nearest_scaling, \
                                                   fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

/*
 * Emits one function which processes a single scanline.
 *
 * Arguments:
 *   use_nearest_scaling - when nonzero, the nearest scaling register
 *                         assignment is used (W, DST_W, SRC, VX, UNIT_X,
 *                         SRC_WIDTH_FIXED, MASK in x0-x6) and source pixels
 *                         are fetched with pixld_s instead of pixld
 *   fname               - name handed to pixman_asm_function
 *   src_bpp_, mask_bpp_, dst_w_bpp_
 *                       - bit depths of source, mask and destination
 *   flags               - tested for FLAG_DST_READWRITE (the destination is
 *                         also read) and FLAG_DEINTERLEAVE_32BPP
 *   pixblock_size_      - number of pixels handled per pixel block
 *   init, cleanup       - macros expanded right after the prologue and
 *                         right before each epilogue
 *   process_pixblock_head, process_pixblock_tail,
 *   process_pixblock_tail_head
 *                       - macros implementing the pixel processing stages
 *   dst_w_basereg_, dst_r_basereg_, src_basereg_, mask_basereg_
 *                       - base NEON register numbers for each image
 *
 * Stack frame (offsets relative to x29 after the prologue):
 *   [x29, 0]    saved x29 and x30
 *   [x29, -64]  low halves of v8-v15
 *   [x29, -80]  x8 and x9  (nearest scaling only)
 *   [x29, -88]  x10        (nearest scaling only)
 */
    pixman_asm_function fname
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

.if use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers for nearest scaling
     */
    W           .req        x0
    DST_W       .req        x1
    SRC         .req        x2
    VX          .req        x3
    UNIT_X      .req        x4
    SRC_WIDTH_FIXED .req    x5
    MASK        .req        x6
    TMP1        .req        x8
    TMP2        .req        x9
    DST_R       .req        x10
    DUMMY       .req        x30

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    /* Sign-extend the 32-bit arguments to full 64-bit registers */
    sxtw        x0, w0
    sxtw        x3, w3
    sxtw        x4, w4
    sxtw        x5, w5

    stp         x29, x30, [sp, -16]!
    mov         x29, sp
    /*
     * 88 bytes are actually used (64 for v8-v15, 16 for x8 and x9,
     * 8 for x10); the reservation is rounded up to 96 so that sp
     * stays 16-byte aligned for the whole function body.
     */
    sub         sp, sp, 96
    /* The two post-incrementing stores bring x29 back to the frame base */
    sub         x29, x29, 64
    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    stp         x8, x9, [x29, -80]
    str         x10, [x29, -88]
.else
    /*
     * Assign symbolic names to registers
     */
    W           .req        x0      /* width (is updated during processing) */
    DST_W       .req        x1      /* destination buffer pointer for writes */
    SRC         .req        x2      /* source buffer pointer */
    MASK        .req        x3      /* mask pointer */
    DST_R       .req        x4      /* destination buffer pointer for reads */
    DUMMY       .req        x30

    .macro pixld_src x:vararg
        pixld x
    .endm

    /* Sign-extend the 32-bit width argument */
    sxtw        x0, w0

    stp         x29, x30, [sp, -16]!
    mov         x29, sp
    sub         sp, sp, 64
    /* The two post-incrementing stores bring x29 back to the frame base */
    sub         x29, x29, 64
    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm

    init
    mov         DST_R, DST_W

    /* Scanlines shorter than one pixel block go straight to label 800 */
    cmp         W, #pixblock_size
    blt         800f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         700f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         200f
100:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         100b
200:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
700:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    /* Restore from the same slots which the prologue has written */
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp         x8, x9, [x29, -80]
    ldr         x10, [x29, -88]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */
.else
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */
.endif
800:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    /* Restore from the same slots which the prologue has written */
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp         x8, x9, [x29, -80]
    ldr         x10, [x29, -88]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */

    .unreq      DUMMY
    .unreq      DST_R
    .unreq      SRC
    .unreq      W
    .unreq      VX
    .unreq      UNIT_X
    .unreq      TMP1
    .unreq      TMP2
    .unreq      DST_W
    .unreq      MASK
    .unreq      SRC_WIDTH_FIXED

.else
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    mov          sp, x29
    ldp          x29, x30, [sp], 16
    ret  /* exit */

    .unreq      DUMMY
    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
.endif

    /* Drop the helper macros so that the next expansion can redefine them */
    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .endfunc
.endm

/*
 * Convenience wrapper: expands the scanline template with
 * use_nearest_scaling set to 0 and forwards every other argument as is.
 */
.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, \x
.endm

/*
 * Convenience wrapper: expands the scanline template with
 * use_nearest_scaling set to 1 and forwards every other argument as is.
 */
.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, \x
.endm

/* Default prologue/epilogue, nothing special needs to be done */

/* Expands to nothing; for use as the 'init' argument of the templates. */
.macro default_init
.endm

/* Expands to nothing; for use as the 'cleanup' argument of the templates. */
.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant for code that needs to use all the NEON
 * registers, including v8-v15 (which have to be preserved by the callee
 * according to the ABI). Both macros are empty here: the function
 * templates in this file already save/restore v8-v15 on their own.
 */

/* Expands to nothing, i.e. it emits no extra register saving code. */
.macro default_init_need_all_regs
.endm

/* Expands to nothing, i.e. it emits no extra register restoring code. */
.macro default_cleanup_need_all_regs
.endm

/******************************************************************************/

/*
 * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
 * into a planar a8r8g8b8 format (with a, r, g, b color components
 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 *          value (in) is lost.
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    /* Narrow each 16-bit pixel so that R and G start at bit 7 of a byte */
    shrn        \out_r\().8b, \in\().8h,    #8
    shrn        \out_g\().8b, \in\().8h,    #3
    /* Shift the pixel left by 5 while keeping its low 5 bits (destroys in) */
    sli         \in\().8h,    \in\().8h,    #5
    /* Alpha is set to fully opaque */
    movi        \out_a\().8b, #255
    /* Fill the low bits of R and G with copies of their own top bits */
    sri         \out_r\().8b, \out_r\().8b, #5
    sri         \out_g\().8b, \out_g\().8b, #6
    /* B is taken from the modified input value */
    shrn        \out_b\().8b, \in\().8h,    #2
.endm

/*
 * Same conversion as convert_0565_to_8888, but no alpha register is
 * written. The value in (in) is destroyed.
 */
.macro convert_0565_to_x888 in, out_r, out_g, out_b
    /* Narrow each 16-bit pixel so that R and G start at bit 7 of a byte */
    shrn        \out_r\().8b, \in\().8h,    #8
    shrn        \out_g\().8b, \in\().8h,    #3
    /* Shift the pixel left by 5 while keeping its low 5 bits (destroys in) */
    sli         \in\().8h,    \in\().8h,    #5
    /* Fill the low bits of R and G with copies of their own top bits */
    sri         \out_r\().8b, \out_r\().8b, #5
    sri         \out_g\().8b, \out_g\().8b, #6
    /* B is taken from the modified input value */
    shrn        \out_b\().8b, \in\().8h,    #2
.endm

/*
 * Conversion from planar a8r8g8b8 format (with r, g, b color components
 * in 64-bit registers in_r, in_g, in_b respectively; the alpha component
 * is not used) into 8 r5g6b5 pixels packed in 128-bit register (out).
 * Requires two temporary 128-bit registers (tmp1, tmp2)
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    /*
     * Each component is widened to 16 bits and moved into the upper byte;
     * this takes two instructions (shift by 7, then by 1) per component.
     */
    ushll       \tmp1\().8h, \in_g\().8b, #7
    shl         \tmp1\().8h, \tmp1\().8h, #1
    ushll       \out\().8h,  \in_r\().8b, #7
    shl         \out\().8h,  \out\().8h,  #1
    ushll       \tmp2\().8h, \in_b\().8b, #7
    shl         \tmp2\().8h, \tmp2\().8h, #1
    /* Keep the top 5 bits (R) and insert G right below them */
    sri         \out\().8h, \tmp1\().8h, #5
    /* Keep the top 11 bits (R and G) and insert B into the low 5 bits */
    sri         \out\().8h, \tmp2\().8h, #11
.endm

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in (out0, out1) registers pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
 * value from 'in' is lost
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    shl         \out0\().4h, \in\().4h,   #5  /* G: 6 bits moved to the top */
    shl         \tmp\().4h,  \in\().4h,   #11 /* B: 5 bits moved to the top */
    sri         \in\().4h,   \in\().4h,   #5  /* R is complete in the top bits */
    sri         \out0\().4h, \out0\().4h, #6  /* G is complete in the top bits */
    sri         \tmp\().4h,  \tmp\().4h,  #5  /* B is complete in the top bits */
    ushr        \out1\().4h, \in\().4h,   #8  /* R moved to its final position */
    sri         \out0\().4h, \tmp\().4h,  #8  /* G and B in their final position */
    /* Interleave the G/B and R halfwords to form the packed pixels */
    zip1        \tmp\().4h,  \out0\().4h, \out1\().4h
    zip2        \out1\().4h, \out0\().4h, \out1\().4h
    mov         \out0\().d[0], \tmp\().d[0]
.endm