Implement YCbCr->RGB565 using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of YCbCr -> RGB565 color
conversion.
Removes both the AArch32 and AArch64 NEON assembly implementations of
jsimd_ycc_rgb565_convert_neon, which the intrinsics version replaces.
Bug: 922430
Change-Id: I3955b9b20bdddc7c9cefe8a8b2a54a0534e58dd5
diff --git a/README.chromium b/README.chromium
index 7e98312..34b867e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -59,6 +59,7 @@
- Add Arm NEON implementation of h2v1_upsample
- Add Arm NEON implementation of h2v2_upsample
- Implement YCbCr->RGB using Arm NEON intrinsics
+ - Implement YCbCr->RGB565 using Arm NEON intrinsics
Refer to working-with-nested-repos [1] for details of how to setup your git
svn client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index a5b736b..0e05819 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -1270,282 +1270,6 @@
/*****************************************************************************/
/*
- * jsimd_ycc_rgb565_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB565
- */
-
-
-.macro do_load size
- .if \size == 8
- vld1.8 {d4}, [U, :64]!
- vld1.8 {d5}, [V, :64]!
- vld1.8 {d0}, [Y, :64]!
- pld [U, #64]
- pld [V, #64]
- pld [Y, #64]
- .elseif \size == 4
- vld1.8 {d4[0]}, [U]!
- vld1.8 {d4[1]}, [U]!
- vld1.8 {d4[2]}, [U]!
- vld1.8 {d4[3]}, [U]!
- vld1.8 {d5[0]}, [V]!
- vld1.8 {d5[1]}, [V]!
- vld1.8 {d5[2]}, [V]!
- vld1.8 {d5[3]}, [V]!
- vld1.8 {d0[0]}, [Y]!
- vld1.8 {d0[1]}, [Y]!
- vld1.8 {d0[2]}, [Y]!
- vld1.8 {d0[3]}, [Y]!
- .elseif \size == 2
- vld1.8 {d4[4]}, [U]!
- vld1.8 {d4[5]}, [U]!
- vld1.8 {d5[4]}, [V]!
- vld1.8 {d5[5]}, [V]!
- vld1.8 {d0[4]}, [Y]!
- vld1.8 {d0[5]}, [Y]!
- .elseif \size == 1
- vld1.8 {d4[6]}, [U]!
- vld1.8 {d5[6]}, [V]!
- vld1.8 {d0[6]}, [Y]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size
- .if \bpp == 16
- .if \size == 8
- vst1.16 {q15}, [RGB]!
- .elseif \size == 4
- vst1.16 {d30}, [RGB]!
- .elseif \size == 2
- vst1.16 {d31[0]}, [RGB]!
- vst1.16 {d31[1]}, [RGB]!
- .elseif \size == 1
- vst1.16 {d31[2]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
- vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- vrshrn.s32 d20, q10, #15
- vrshrn.s32 d21, q11, #15
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vrshrn.s32 d28, q14, #14
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q11, q10, d0
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- vqshlu.s16 q13, q11, #8
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vsri.u16 q15, q13, #5
- vsri.u16 q15, q14, #11
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
- /* "do_yuv_to_rgb_stage2" and "store" */
- vrshrn.s32 d20, q10, #15
- /* "load" and "do_yuv_to_rgb_stage1" */
- pld [U, #64]
- vrshrn.s32 d21, q11, #15
- pld [V, #64]
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vld1.8 {d4}, [U, :64]!
- vrshrn.s32 d28, q14, #14
- vld1.8 {d5}, [V, :64]!
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
- vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
- vaddw.u8 q11, q10, d0
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- vqshlu.s16 q13, q11, #8
- pld [Y, #64]
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vld1.8 {d0}, [Y, :64]!
- vmull.s16 q11, d7, d1[1]
- vmlal.s16 q11, d9, d1[2]
- vsri.u16 q15, q13, #5
- vmull.s16 q12, d8, d1[0]
- vsri.u16 q15, q14, #11
- vmull.s16 q13, d9, d1[0]
- vmull.s16 q14, d6, d1[3]
- do_store \bpp, 8
- vmull.s16 q15, d7, d1[3]
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- INPUT_ROW .req r2
- OUTPUT_BUF .req r3
- NUM_ROWS .req r4
-
- INPUT_BUF0 .req r5
- INPUT_BUF1 .req r6
- INPUT_BUF2 .req INPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d1, d2, d3 (d0 is just used for padding) */
- adr ip, jsimd_ycc_\colorid\()_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #4]
- ldr INPUT_BUF2, [INPUT_BUF, #8]
- .unreq INPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Initially set d10, d11, d12, d13 to 0xFF */
- vmov.u8 q5, #255
- vmov.u8 q6, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
- ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- blt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load 4
-3:
- tst N, #2
- beq 4f
- do_load 2
-4:
- tst N, #1
- beq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- beq 6f
- do_store \bpp, 4
-6:
- tst N, #2
- beq 7f
- do_store \bpp, 2
-7:
- tst N, #1
- beq 8f
- do_store \bpp, 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
* jsimd_extrgb_ycc_convert_neon
* jsimd_extbgr_ycc_convert_neon
* jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index d0b9a32..3a1d1ef 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -1430,295 +1430,6 @@
/*****************************************************************************/
/*
- * jsimd_ycc_rgb565_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB565
- */
-
-.macro do_load size
- .if \size == 8
- ld1 {v4.8b}, [U], 8
- ld1 {v5.8b}, [V], 8
- ld1 {v0.8b}, [Y], 8
- prfm pldl1keep, [U, #64]
- prfm pldl1keep, [V, #64]
- prfm pldl1keep, [Y, #64]
- .elseif \size == 4
- ld1 {v4.b}[0], [U], 1
- ld1 {v4.b}[1], [U], 1
- ld1 {v4.b}[2], [U], 1
- ld1 {v4.b}[3], [U], 1
- ld1 {v5.b}[0], [V], 1
- ld1 {v5.b}[1], [V], 1
- ld1 {v5.b}[2], [V], 1
- ld1 {v5.b}[3], [V], 1
- ld1 {v0.b}[0], [Y], 1
- ld1 {v0.b}[1], [Y], 1
- ld1 {v0.b}[2], [Y], 1
- ld1 {v0.b}[3], [Y], 1
- .elseif \size == 2
- ld1 {v4.b}[4], [U], 1
- ld1 {v4.b}[5], [U], 1
- ld1 {v5.b}[4], [V], 1
- ld1 {v5.b}[5], [V], 1
- ld1 {v0.b}[4], [Y], 1
- ld1 {v0.b}[5], [Y], 1
- .elseif \size == 1
- ld1 {v4.b}[6], [U], 1
- ld1 {v5.b}[6], [V], 1
- ld1 {v0.b}[6], [Y], 1
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size, fast_st3
- .if \bpp == 16
- .if \size == 8
- st1 {v25.8h}, [RGB], 16
- .elseif \size == 4
- st1 {v25.4h}, [RGB], 8
- .elseif \size == 2
- st1 {v25.h}[4], [RGB], 2
- st1 {v25.h}[5], [RGB], 2
- .elseif \size == 1
- st1 {v25.h}[6], [RGB], 2
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
- g_offs, gsize, b_offs, bsize, \
- defsize, fast_st3
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
- uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
- uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
- smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
- smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
- smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- rshrn v20.4h, v20.4s, #15
- rshrn2 v20.8h, v22.4s, #15
- rshrn v24.4h, v24.4s, #14
- rshrn2 v24.8h, v26.4s, #14
- rshrn v28.4h, v28.4s, #14
- rshrn2 v28.8h, v30.4s, #14
- uaddw v20.8h, v20.8h, v0.8b
- uaddw v24.8h, v24.8h, v0.8b
- uaddw v28.8h, v28.8h, v0.8b
- sqshlu v21.8h, v20.8h, #8
- sqshlu v25.8h, v24.8h, #8
- sqshlu v29.8h, v28.8h, #8
- sri v25.8h, v21.8h, #5
- sri v25.8h, v29.8h, #11
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
- rshrn v20.4h, v20.4s, #15
- rshrn v24.4h, v24.4s, #14
- rshrn v28.4h, v28.4s, #14
- ld1 {v4.8b}, [U], 8
- rshrn2 v20.8h, v22.4s, #15
- rshrn2 v24.8h, v26.4s, #14
- rshrn2 v28.8h, v30.4s, #14
- ld1 {v5.8b}, [V], 8
- uaddw v20.8h, v20.8h, v0.8b
- uaddw v24.8h, v24.8h, v0.8b
- uaddw v28.8h, v28.8h, v0.8b
- sqshlu v21.8h, v20.8h, #8
- sqshlu v25.8h, v24.8h, #8
- sqshlu v29.8h, v28.8h, #8
- uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
- uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
- ld1 {v0.8b}, [Y], 8
- smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
- sri v25.8h, v21.8h, #5
- smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
- prfm pldl1keep, [U, #64]
- prfm pldl1keep, [V, #64]
- prfm pldl1keep, [Y, #64]
- sri v25.8h, v29.8h, #11
- do_store \bpp, 8, \fast_st3
- smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-.if \fast_st3 == 1
-asm_function jsimd_ycc_\colorid\()_convert_neon
-.else
-asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
-.endif
- OUTPUT_WIDTH .req w0
- INPUT_BUF .req x1
- INPUT_ROW .req w2
- OUTPUT_BUF .req x3
- NUM_ROWS .req w4
-
- INPUT_BUF0 .req x5
- INPUT_BUF1 .req x6
- INPUT_BUF2 .req x1
-
- RGB .req x7
- Y .req x9
- U .req x10
- V .req x11
- N .req w15
-
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
- .if \fast_st3 == 1
- adr x15, Ljsimd_ycc_\colorid\()_neon_consts
- .else
- adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
- .endif
-
- /* Save NEON registers */
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v0.4h, v1.4h}, [x15], 16
- ld1 {v2.8h}, [x15]
-
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #8]
- ldr INPUT_BUF2, [INPUT_BUF, #16]
- .unreq INPUT_BUF
-
- /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
- movi v10.16b, #255
- movi v13.16b, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- b.lt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
- ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #8
-
- /* Inner loop over pixels */
- subs N, N, #8
- b.lt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- b.lt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
- subs N, N, #8
- b.ge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8, \fast_st3
- tst N, #7
- b.eq 8f
-3:
- tst N, #4
- b.eq 3f
- do_load 4
-3:
- tst N, #2
- b.eq 4f
- do_load 2
-4:
- tst N, #1
- b.eq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- b.eq 6f
- do_store \bpp, 4, \fast_st3
-6:
- tst N, #2
- b.eq 7f
- do_store \bpp, 2, \fast_st3
-7:
- tst N, #1
- b.eq 8f
- do_store \bpp, 1, \fast_st3
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- b.gt 0b
-9:
- /* Restore all registers and return */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- br x30
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
* jsimd_extrgb_ycc_convert_neon
* jsimd_extbgr_ycc_convert_neon
* jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/common/jdcolext-neon.c b/simd/arm/common/jdcolext-neon.c
index 61a0ff8..b201792 100644
--- a/simd/arm/common/jdcolext-neon.c
+++ b/simd/arm/common/jdcolext-neon.c
@@ -116,7 +116,7 @@
int16x8_t g_h = vreinterpretq_s16_u16(
vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), vget_high_u8(y)));
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
uint8x16x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -126,7 +126,7 @@
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
uint8x16x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -134,7 +134,19 @@
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
-#endif
+#else /* RGB565 */
+ /* Pack R, G and B as 5:6:5 (R: bits 15-11, G: 10-5, B: 4-0). */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store 16 RGB565 pixels to memory as two 8-lane vectors. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif /* RGB565 */
+
/* Increment pointers. */
inptr0 += 16;
inptr1 += 16;
@@ -171,7 +183,7 @@
int16x8_t g = vreinterpretq_s16_u16(
vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -181,7 +193,7 @@
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -189,7 +201,15 @@
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
vst3_u8(outptr, rgb);
-#endif
+#else /* RGB565 */
+ /* Pack R, G and B as 5:6:5 (R: bits 15-11, G: 10-5, B: 4-0). */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store 8 RGB565 pixels to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif /* RGB565 */
+
/* Increment pointers. */
inptr0 += 8;
inptr1 += 8;
@@ -228,7 +248,7 @@
int16x8_t g = vreinterpretq_s16_u16(
vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -255,7 +275,7 @@
default:
break;
}
-#else
+#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -280,7 +300,31 @@
default:
break;
}
-#endif
+#else /* RGB565 */
+ /* Pack R, G and B as 5:6:5 (R: bits 15-11, G: 10-5, B: 4-0). */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store remaining RGB565 pixels lane by lane; cases fall through. */
+ switch (cols_remaining) {
+ case 7 :
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ case 6 :
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ case 5 :
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ case 4 :
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ case 3 :
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ case 2 :
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ case 1 :
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ default:
+ break;
+ }
+#endif /* RGB565 */
}
}
}
diff --git a/simd/arm/common/jdcolor-neon.c b/simd/arm/common/jdcolor-neon.c
index 4a7e427..52dab1e 100644
--- a/simd/arm/common/jdcolor-neon.c
+++ b/simd/arm/common/jdcolor-neon.c
@@ -124,3 +124,11 @@
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion. */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon