Implement YCbCr->RGB565 using Arm NEON intrinsics

Adds an Arm NEON intrinsics implementation of YCbCr -> RGB565 color
conversion.

Removes both the AArch32 and AArch64 NEON assembly implementations.

Bug: 922430
Change-Id: I3955b9b20bdddc7c9cefe8a8b2a54a0534e58dd5
diff --git a/README.chromium b/README.chromium
index 7e98312..34b867e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -59,6 +59,7 @@
   - Add Arm NEON implementation of h2v1_upsample
   - Add Arm NEON implementation of h2v2_upsample
   - Implement YCbCr->RGB using Arm NEON intrinsics
+  - Implement YCbCr->RGB565 using Arm NEON intrinsics
 
 Refer to working-with-nested-repos [1] for details of how to setup your git
 svn client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index a5b736b..0e05819 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -1270,282 +1270,6 @@
 /*****************************************************************************/
 
 /*
- * jsimd_ycc_rgb565_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB565
- */
-
-
-.macro do_load size
-  .if \size == 8
-    vld1.8          {d4}, [U, :64]!
-    vld1.8          {d5}, [V, :64]!
-    vld1.8          {d0}, [Y, :64]!
-    pld             [U, #64]
-    pld             [V, #64]
-    pld             [Y, #64]
-  .elseif \size == 4
-    vld1.8          {d4[0]}, [U]!
-    vld1.8          {d4[1]}, [U]!
-    vld1.8          {d4[2]}, [U]!
-    vld1.8          {d4[3]}, [U]!
-    vld1.8          {d5[0]}, [V]!
-    vld1.8          {d5[1]}, [V]!
-    vld1.8          {d5[2]}, [V]!
-    vld1.8          {d5[3]}, [V]!
-    vld1.8          {d0[0]}, [Y]!
-    vld1.8          {d0[1]}, [Y]!
-    vld1.8          {d0[2]}, [Y]!
-    vld1.8          {d0[3]}, [Y]!
-  .elseif \size == 2
-    vld1.8          {d4[4]}, [U]!
-    vld1.8          {d4[5]}, [U]!
-    vld1.8          {d5[4]}, [V]!
-    vld1.8          {d5[5]}, [V]!
-    vld1.8          {d0[4]}, [Y]!
-    vld1.8          {d0[5]}, [Y]!
-  .elseif \size == 1
-    vld1.8          {d4[6]}, [U]!
-    vld1.8          {d5[6]}, [V]!
-    vld1.8          {d0[6]}, [Y]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size
-  .if \bpp == 16
-    .if \size == 8
-      vst1.16       {q15}, [RGB]!
-    .elseif \size == 4
-      vst1.16       {d30}, [RGB]!
-    .elseif \size == 2
-      vst1.16       {d31[0]}, [RGB]!
-      vst1.16       {d31[1]}, [RGB]!
-    .elseif \size == 1
-      vst1.16       {d31[2]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    vrshrn.s32      d20, q10, #15
-    vrshrn.s32      d21, q11, #15
-    vrshrn.s32      d24, q12, #14
-    vrshrn.s32      d25, q13, #14
-    vrshrn.s32      d28, q14, #14
-    vrshrn.s32      d29, q15, #14
-    vaddw.u8        q11, q10, d0
-    vaddw.u8        q12, q12, d0
-    vaddw.u8        q14, q14, d0
-    vqshlu.s16      q13, q11, #8
-    vqshlu.s16      q15, q12, #8
-    vqshlu.s16      q14, q14, #8
-    vsri.u16        q15, q13, #5
-    vsri.u16        q15, q14, #11
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-                                       /* "do_yuv_to_rgb_stage2" and "store" */
-                                       vrshrn.s32      d20, q10, #15
-    /* "load" and "do_yuv_to_rgb_stage1" */
-    pld             [U, #64]
-                                       vrshrn.s32      d21, q11, #15
-    pld             [V, #64]
-                                       vrshrn.s32      d24, q12, #14
-                                       vrshrn.s32      d25, q13, #14
-    vld1.8          {d4}, [U, :64]!
-                                       vrshrn.s32      d28, q14, #14
-    vld1.8          {d5}, [V, :64]!
-                                       vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-                                       vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-                                       vaddw.u8        q12, q12, d0
-                                       vaddw.u8        q14, q14, d0
-                                       vqshlu.s16      q13, q11, #8
-    pld             [Y, #64]
-                                       vqshlu.s16      q15, q12, #8
-                                       vqshlu.s16      q14, q14, #8
-    vld1.8          {d0}, [Y, :64]!
-    vmull.s16       q11, d7, d1[1]
-    vmlal.s16       q11, d9, d1[2]
-                                       vsri.u16        q15, q13, #5
-    vmull.s16       q12, d8, d1[0]
-                                       vsri.u16        q15, q14, #11
-    vmull.s16       q13, d9, d1[0]
-    vmull.s16       q14, d6, d1[3]
-                                       do_store        \bpp, 8
-    vmull.s16       q15, d7, d1[3]
-.endm
-
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    INPUT_ROW       .req r2
-    OUTPUT_BUF      .req r3
-    NUM_ROWS        .req r4
-
-    INPUT_BUF0      .req r5
-    INPUT_BUF1      .req r6
-    INPUT_BUF2      .req INPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adr             ip, jsimd_ycc_\colorid\()_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save ARM registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #4]
-    ldr             INPUT_BUF2, [INPUT_BUF, #8]
-    .unreq          INPUT_BUF
-
-    /* Save NEON registers */
-    vpush           {d8-d15}
-
-    /* Initially set d10, d11, d12, d13 to 0xFF */
-    vmov.u8         q5, #255
-    vmov.u8         q6, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_yuv_to_rgb_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8-d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
  * jsimd_extrgb_ycc_convert_neon
  * jsimd_extbgr_ycc_convert_neon
  * jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index d0b9a32..3a1d1ef 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -1430,295 +1430,6 @@
 /*****************************************************************************/
 
 /*
- * jsimd_ycc_rgb565_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB565
- */
-
-.macro do_load size
-  .if \size == 8
-    ld1             {v4.8b}, [U], 8
-    ld1             {v5.8b}, [V], 8
-    ld1             {v0.8b}, [Y], 8
-    prfm            pldl1keep, [U, #64]
-    prfm            pldl1keep, [V, #64]
-    prfm            pldl1keep, [Y, #64]
-  .elseif \size == 4
-    ld1             {v4.b}[0], [U], 1
-    ld1             {v4.b}[1], [U], 1
-    ld1             {v4.b}[2], [U], 1
-    ld1             {v4.b}[3], [U], 1
-    ld1             {v5.b}[0], [V], 1
-    ld1             {v5.b}[1], [V], 1
-    ld1             {v5.b}[2], [V], 1
-    ld1             {v5.b}[3], [V], 1
-    ld1             {v0.b}[0], [Y], 1
-    ld1             {v0.b}[1], [Y], 1
-    ld1             {v0.b}[2], [Y], 1
-    ld1             {v0.b}[3], [Y], 1
-  .elseif \size == 2
-    ld1             {v4.b}[4], [U], 1
-    ld1             {v4.b}[5], [U], 1
-    ld1             {v5.b}[4], [V], 1
-    ld1             {v5.b}[5], [V], 1
-    ld1             {v0.b}[4], [Y], 1
-    ld1             {v0.b}[5], [Y], 1
-  .elseif \size == 1
-    ld1             {v4.b}[6], [U], 1
-    ld1             {v5.b}[6], [V], 1
-    ld1             {v0.b}[6], [Y], 1
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size, fast_st3
-  .if \bpp == 16
-    .if \size == 8
-      st1           {v25.8h}, [RGB], 16
-    .elseif \size == 4
-      st1           {v25.4h}, [RGB], 8
-    .elseif \size == 2
-      st1           {v25.h}[4], [RGB], 2
-      st1           {v25.h}[5], [RGB], 2
-    .elseif \size == 1
-      st1           {v25.h}[6], [RGB], 2
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
-                                           g_offs, gsize, b_offs, bsize, \
-                                           defsize, fast_st3
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
-    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    rshrn           v20.4h, v20.4s, #15
-    rshrn2          v20.8h, v22.4s, #15
-    rshrn           v24.4h, v24.4s, #14
-    rshrn2          v24.8h, v26.4s, #14
-    rshrn           v28.4h, v28.4s, #14
-    rshrn2          v28.8h, v30.4s, #14
-    uaddw           v20.8h, v20.8h, v0.8b
-    uaddw           v24.8h, v24.8h, v0.8b
-    uaddw           v28.8h, v28.8h, v0.8b
-    sqshlu          v21.8h, v20.8h, #8
-    sqshlu          v25.8h, v24.8h, #8
-    sqshlu          v29.8h, v28.8h, #8
-    sri             v25.8h, v21.8h, #5
-    sri             v25.8h, v29.8h, #11
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
-    rshrn           v20.4h, v20.4s, #15
-    rshrn           v24.4h, v24.4s, #14
-    rshrn           v28.4h, v28.4s, #14
-    ld1             {v4.8b}, [U], 8
-    rshrn2          v20.8h, v22.4s, #15
-    rshrn2          v24.8h, v26.4s, #14
-    rshrn2          v28.8h, v30.4s, #14
-    ld1             {v5.8b}, [V], 8
-    uaddw           v20.8h, v20.8h, v0.8b
-    uaddw           v24.8h, v24.8h, v0.8b
-    uaddw           v28.8h, v28.8h, v0.8b
-    sqshlu          v21.8h, v20.8h, #8
-    sqshlu          v25.8h, v24.8h, #8
-    sqshlu          v29.8h, v28.8h, #8
-    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    ld1             {v0.8b}, [Y], 8
-    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    sri             v25.8h, v21.8h, #5
-    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    prfm            pldl1keep, [U, #64]
-    prfm            pldl1keep, [V, #64]
-    prfm            pldl1keep, [Y, #64]
-    sri             v25.8h, v29.8h, #11
-    do_store        \bpp, 8, \fast_st3
-    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-.if \fast_st3 == 1
-asm_function jsimd_ycc_\colorid\()_convert_neon
-.else
-asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
-.endif
-    OUTPUT_WIDTH    .req w0
-    INPUT_BUF       .req x1
-    INPUT_ROW       .req w2
-    OUTPUT_BUF      .req x3
-    NUM_ROWS        .req w4
-
-    INPUT_BUF0      .req x5
-    INPUT_BUF1      .req x6
-    INPUT_BUF2      .req x1
-
-    RGB             .req x7
-    Y               .req x9
-    U               .req x10
-    V               .req x11
-    N               .req w15
-
-    sub             sp, sp, 64
-    mov             x9, sp
-
-    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    .if \fast_st3 == 1
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
-    .else
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
-    .endif
-
-    /* Save NEON registers */
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v0.4h, v1.4h}, [x15], 16
-    ld1             {v2.8h}, [x15]
-
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #8]
-    ldr             INPUT_BUF2, [INPUT_BUF, #16]
-    .unreq          INPUT_BUF
-
-    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
-    movi            v10.16b, #255
-    movi            v13.16b, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    b.lt            9f
-0:
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #8
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    b.lt            3f
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    b.lt            2f
-1:
-    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
-    subs            N, N, #8
-    b.ge            1b
-2:
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8, \fast_st3
-    tst             N, #7
-    b.eq            8f
-3:
-    tst             N, #4
-    b.eq            3f
-    do_load         4
-3:
-    tst             N, #2
-    b.eq            4f
-    do_load         2
-4:
-    tst             N, #1
-    b.eq            5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    tst             N, #4
-    b.eq            6f
-    do_store        \bpp, 4, \fast_st3
-6:
-    tst             N, #2
-    b.eq            7f
-    do_store        \bpp, 2, \fast_st3
-7:
-    tst             N, #1
-    b.eq            8f
-    do_store        \bpp, 1, \fast_st3
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    b.gt            0b
-9:
-    /* Restore all registers and return */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    br              x30
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
  * jsimd_extrgb_ycc_convert_neon
  * jsimd_extbgr_ycc_convert_neon
  * jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/common/jdcolext-neon.c b/simd/arm/common/jdcolext-neon.c
index 61a0ff8..b201792 100644
--- a/simd/arm/common/jdcolext-neon.c
+++ b/simd/arm/common/jdcolext-neon.c
@@ -116,7 +116,7 @@
       int16x8_t g_h = vreinterpretq_s16_u16(
                 vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), vget_high_u8(y)));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x16x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -126,7 +126,7 @@
       rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
       /* Store RGBA pixel data to memory. */
       vst4q_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x16x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -134,7 +134,19 @@
       rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
       /* Store RGB pixel data to memory. */
       vst3q_u8(outptr, rgb);
-#endif
+#else /* RGB565 */
+      /* Clamp each channel to 8 bits, then pack R:15-11, G:10-5, B:4-0. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store the 16 RGB565 pixels (low 8, then high 8) to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif /* RGB565 */
+
       /* Increment pointers. */
       inptr0 += 16;
       inptr1 += 16;
@@ -171,7 +183,7 @@
       int16x8_t g = vreinterpretq_s16_u16(
                                 vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x8x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -181,7 +193,7 @@
       rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
       /* Store RGBA pixel data to memory. */
       vst4_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x8x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -189,7 +201,15 @@
       rgb.val[RGB_BLUE] = vqmovun_s16(b);
       /* Store RGB pixel data to memory. */
       vst3_u8(outptr, rgb);
-#endif
+#else /* RGB565 */
+      /* Clamp each channel to 8 bits, then pack R:15-11, G:10-5, B:4-0. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store the 8 RGB565 pixels to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif /* RGB565 */
+
       /* Increment pointers. */
       inptr0 += 8;
       inptr1 += 8;
@@ -228,7 +248,7 @@
       int16x8_t g = vreinterpretq_s16_u16(
                                 vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x8x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -255,7 +275,7 @@
       default:
         break;
       }
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x8x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -280,7 +300,31 @@
       default:
         break;
       }
-#endif
+#else /* RGB565 */
+      /* Clamp each channel to 8 bits, then pack R:15-11, G:10-5, B:4-0. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store leftover pixels (cases fall through); cast the byte pointer. */
+      switch (cols_remaining) {
+      case 7 :
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+      case 6 :
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+      case 5 :
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+      case 4 :
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+      case 3 :
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+      case 2 :
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+      case 1 :
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+      default:
+        break;
+      }
+#endif /* RGB565 */
     }
   }
 }
diff --git a/simd/arm/common/jdcolor-neon.c b/simd/arm/common/jdcolor-neon.c
index 4a7e427..52dab1e 100644
--- a/simd/arm/common/jdcolor-neon.c
+++ b/simd/arm/common/jdcolor-neon.c
@@ -124,3 +124,11 @@
 #undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 conversion: a pixel size of 2 selects the 565 path. */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon