| /* |
| * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "libyuv/row.h" |
| |
| #ifdef __cplusplus |
| namespace libyuv { |
| extern "C" { |
| #endif |
| |
| // Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer |
| // STn over ZIP1+ST1 |
| // Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. |
| |
| // This module is for GCC Neon armv8 64 bit. |
| #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| |
// Register contract shared by the READ* macros below:
// v0.8h: Y (widened to 16 bits per pixel)
// v1.16b: 8U, 8V

// Read 8 Y, 4 U and 4 V from 422.
// Y bytes are duplicated into 16-bit lanes (zip1 with itself); the 4 U and
// 4 V bytes are each duplicated so there is one chroma sample per pixel.
// Outputs: v0.8h = Y, v1.8b = U, v2.8b = V. Prefetches ahead on all planes.
#define READYUV422 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr s1, [%[src_u]], #4 \n" \
  "ldr s2, [%[src_v]], #4 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v1.8b, v1.8b, v1.8b \n" \
  "zip1 v2.8b, v2.8b, v2.8b \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 4 U and 4 V from 210 (10-bit, half-width chroma).
// The 10-bit Y is scaled to 16 bits (shl #6 then usra #4 replicates the top
// bits into the low bits); 10-bit U/V are duplicated per pixel and narrowed
// to 8 bits with saturation so that v1 holds 8U in the low half and 8V in
// the high half, as NVTORGB expects.
#define READYUV210 \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #6 \n" \
  "usra v0.8h, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #2 \n" \
  "uqshrn2 v1.16b, v2.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"
| |
// Read 8 Y, 4 U and 4 V interleaved from 210 (P210: packed 10-bit, UV
// interleaved). v2 must be preloaded with kP210LoadShuffleIndices; the tbl
// extracts the high bytes of the 16-bit U/V samples into v1 (U low half,
// V high half). Y is left as raw 16-bit samples in v0 -- the 10-bit data
// sits in the high bits, matching what NVTORGB consumes.
#define READYUVP210 \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldr q1, [%[src_uv]], #16 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n"

// Read 8 Y, 4 U and 4 V from 212 (12-bit, half-width chroma).
// Same layout as READYUV210 but scaled for 12-bit samples: Y uses
// shl #4 / usra #8, and U/V are narrowed with uqshrn #4.
#define READYUV212 \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #4 \n" \
  "usra v0.8h, v2.8h, #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #4 \n" \
  "uqshrn2 v1.16b, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V from 410 (10-bit, full-width chroma).
// No chroma duplication needed; U/V are narrowed to 8 bits into the low/high
// halves of v1 respectively.
#define READYUV410 \
  "ldr q1, [%[src_y]], #16 \n" \
  "ldr q2, [%[src_u]], #16 \n" \
  "ldr q3, [%[src_v]], #16 \n" \
  "shl v0.8h, v1.8h, #6 \n" \
  "usra v0.8h, v1.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "uqshrn v1.8b, v2.8h, #2 \n" \
  "uqshrn2 v1.16b, v3.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V interleaved from 410 (P410). v2 must be preloaded
// with kP410LoadShuffleIndices; the two-register tbl gathers the high bytes
// of all 16 interleaved U/V samples into v1 (U low half, V high half).
#define READYUVP410 \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldp q4, q5, [%[src_uv]], #32 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v4.16b, v5.16b}, v2.16b \n"
| |
// Read 8 Y, 8 U and 8 V from 444 (8-bit, full-width chroma).
// Y bytes are duplicated into 16-bit lanes; U/V stay as 8 bytes each in
// v1/v2 for I4XXTORGB.
#define READYUV444 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d2, [%[src_v]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_v], 448] \n"

// Read 8 Y only (I400/J400 luma-only input); Y bytes duplicated into
// 16-bit lanes in v0.
#define READYUV400 \
  "ldr d0, [%[src_y]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n"
| |
// tbl index tables used by READNV12/READYUY2/READUYVY to deinterleave and
// duplicate chroma into v1 (U in the low half, V in the high half).
// "Interleaved" variants index a YUYV/UYVY-style byte stream where chroma
// bytes sit at every other position.
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8, 8, 12, 12,
                                            2, 2, 6, 6, 10, 10, 14, 14};
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
                                 0, 0, 2, 2, 4, 4, 6, 6};
static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
                                            3, 3, 7, 7, 11, 11, 15, 15};
| |
// Read 8 Y and 4 UV from NV12 or NV21. v2 must be preloaded with kNV12Table
// or kNV21Table, which both selects U-vs-V order and duplicates each chroma
// byte per pixel via tbl.
#define READNV12 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_uv]], #8 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n" \
  "prfm pldl1keep, [%[src_uv], 448] \n"

// Read 8 YUY2 pixels: trn1 extracts/duplicates the Y bytes (even positions)
// into v0; tbl (with an interleaved table in v2) gathers chroma into v1.
#define READYUY2 \
  "ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \
  "trn1 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_yuy2], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// Read 8 UYVY pixels: like READYUY2 but Y bytes are at odd positions, hence
// trn2.
#define READUYVY \
  "ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \
  "trn2 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_uyvy], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"
| |
// Load conversion constants, broadcast per lane:
// v28..v31 <- UB VR UG VG (from kUVCoeff)
// v24..v27 <- YG BB BG BR (from kRGBCoeffBias)
#define YUVTORGB_SETUP \
  "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
  "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"

// Outputs of the *TORGB macros:
// v16.8h: B
// v17.8h: G
// v18.8h: R

// Convert from YUV (NV12 or NV21) to 2.14 fixed point RGB.
// Similar to I4XXTORGB but U/V components are in the low/high halves of v1.
// Y is scaled by YG (keeping the high 16 bits of the 32-bit product via
// uzp2); chroma contributions DB/DG/DR are added/subtracted with saturation.
// Clobbers v0, v3-v6; results in v16/v17/v18.
#define NVTORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v6.8h, v1.8b, v30.8b \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
| |
// Convert from YUV (I444 or I420) to 2.14 fixed point RGB.
// Similar to NVTORGB but U/V components are in v1/v2 (8 bytes each).
// Clobbers v0, v3-v6; results in v16/v17/v18.
#define I4XXTORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v6.8h, v1.8b, v30.8b \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "umlal v6.8h, v2.8b, v31.8b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull v5.8h, v2.8b, v29.8b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from YUV I400 to 2.14 fixed point RGB.
// Requires v4 (DB), v5 (DR) and v6 (DG) to be precomputed by the caller for
// a fixed U=V=128 (see I400ToARGBRow_NEON's prologue).
#define I400TORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
| |
// Convert from 2.14 fixed point RGB to 8 bit RGB (saturating narrow into
// the .8b halves of v16/v17/v18).
#define RGBTORGB8 \
  "uqshrn v17.8b, v17.8h, #6 \n" \
  "uqshrn v16.8b, v16.8h, #6 \n" \
  "uqshrn v18.8b, v18.8h, #6 \n"

// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the
// top half of each 16-bit lane (saturating shift left; used by the RGB565 /
// ARGB1555 stores which take bits from the top).
#define RGBTORGB8_TOP \
  "uqshl v17.8h, v17.8h, #2 \n" \
  "uqshl v16.8h, v16.8h, #2 \n" \
  "uqshl v18.8h, v18.8h, #2 \n"

// Store 2.14 fixed point RGB as AR30 elements
#define STOREAR30 \
  /* Inputs: \
   * v16.8h: xxbbbbbbbbbbxxxx \
   * v17.8h: xxggggggggggxxxx \
   * v18.8h: xxrrrrrrrrrrxxxx \
   * v22.8h: 0011111111110000 (umin limit) \
   * v23.8h: 1100000000000000 (alpha) \
   */ \
  "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
  "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \
  "umin v6.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \
  "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \
  "orr v5.16b, v6.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \
  "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \
  "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \
  "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n"

// Common clobber list for the YUVTORGB macros above: scratch v0-v7, RGB
// outputs v16-v18, and the coefficient registers v24-v31.
#define YUVTORGB_REGS \
  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
  "v25", "v26", "v27", "v28", "v29", "v30", "v31"
| |
// Convert a row of I444 (8-bit planar YUV, full-resolution chroma) to ARGB,
// 8 pixels per loop iteration, using the coefficients in yuvconstants.
// Alpha is set to 255. Loops while width > 0 in steps of 8, so width is
// assumed to be a positive multiple of 8 -- TODO confirm caller contract.
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV444 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I444 to RGB24 (3 bytes/pixel, no alpha), 8 pixels per
// loop iteration. width assumed to be a positive multiple of 8.
void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
| |
| void I210ToAR30Row_NEON(const uint16_t* src_y, |
| const uint16_t* src_u, |
| const uint16_t* src_v, |
| uint8_t* dst_ar30, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| const uvec8* uv_coeff = &yuvconstants->kUVCoeff; |
| const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; |
| uint16_t limit = 0x3ff0; |
| uint16_t alpha = 0xc000; |
| asm volatile (YUVTORGB_SETUP |
| "dup v22.8h, %w[limit] \n" |
| "dup v23.8h, %w[alpha] \n" |
| "1: \n" READYUV210 NVTORGB |
| "subs %w[width], %w[width], #8 \n" STOREAR30 |
| "b.gt 1b \n" |
| : [src_y] "+r"(src_y), // %[src_y] |
| [src_u] "+r"(src_u), // %[src_u] |
| [src_v] "+r"(src_v), // %[src_v] |
| [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] |
| [width] "+r"(width) // %[width] |
| : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] |
| [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] |
| [limit] "r"(limit), // %[limit] |
| [alpha] "r"(alpha) // %[alpha] |
| : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); |
| } |
| |
| void I410ToAR30Row_NEON(const uint16_t* src_y, |
| const uint16_t* src_u, |
| const uint16_t* src_v, |
| uint8_t* dst_ar30, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| const uvec8* uv_coeff = &yuvconstants->kUVCoeff; |
| const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; |
| uint16_t limit = 0x3ff0; |
| uint16_t alpha = 0xc000; |
| asm volatile (YUVTORGB_SETUP |
| "dup v22.8h, %w[limit] \n" |
| "dup v23.8h, %w[alpha] \n" |
| "1: \n" READYUV410 NVTORGB |
| "subs %w[width], %w[width], #8 \n" STOREAR30 |
| "b.gt 1b \n" |
| : [src_y] "+r"(src_y), // %[src_y] |
| [src_u] "+r"(src_u), // %[src_u] |
| [src_v] "+r"(src_v), // %[src_v] |
| [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] |
| [width] "+r"(width) // %[width] |
| : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] |
| [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] |
| [limit] "r"(limit), // %[limit] |
| [alpha] "r"(alpha) // %[alpha] |
| : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); |
| } |
| |
// Convert a row of I212 (12-bit planar, half-width chroma) to AR30,
// 8 pixels per loop iteration. Alpha comes from the movi immediate rather
// than a register operand. width assumed to be a positive multiple of 8.
void I212ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp for the 10-bit R channel
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV212 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [limit] "r"(limit) // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
| |
// Convert a row of I210 (10-bit planar, half-width chroma) to ARGB with
// alpha = 255, 8 pixels per loop iteration. width assumed to be a positive
// multiple of 8.
void I210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I410 (10-bit planar, full-width chroma) to ARGB with
// alpha = 255, 8 pixels per loop iteration. width assumed to be a positive
// multiple of 8.
void I410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I212 (12-bit planar, half-width chroma) to ARGB with
// alpha = 255, 8 pixels per loop iteration. width assumed to be a positive
// multiple of 8.
void I212ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV212 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I422 (8-bit planar, half-width chroma) to ARGB with
// alpha = 255, 8 pixels per loop iteration. width assumed to be a positive
// multiple of 8.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
| uint8_t kP210LoadShuffleIndices[] = {1, 1, 5, 5, 9, 9, 13, 13, |
| 3, 3, 7, 7, 11, 11, 15, 15}; |
| |
// Convert a row of P210 (10-bit biplanar, half-width interleaved chroma) to
// ARGB with alpha = 255, 8 pixels per loop iteration. v2 holds the tbl
// shuffle indices for the whole loop. width assumed to be a positive
// multiple of 8.
void P210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"
      "1: \n" //
      READYUVP210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_uv] "+r"(src_uv), // %[src_uv]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
| uint8_t kP410LoadShuffleIndices[] = {1, 5, 9, 13, 17, 21, 25, 29, |
| 3, 7, 11, 15, 19, 23, 27, 31}; |
| |
// Convert a row of P410 (10-bit biplanar, full-width interleaved chroma) to
// ARGB with alpha = 255, 8 pixels per loop iteration. v2 holds the tbl
// shuffle indices for the whole loop. width assumed to be a positive
// multiple of 8.
void P410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"
      "1: \n" //
      READYUVP410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_uv] "+r"(src_uv), // %[src_uv]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of P210 (10-bit biplanar, half-width interleaved chroma) to
// AR30, 8 pixels per loop iteration. width assumed to be a positive
// multiple of 8.
void P210ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp for the 10-bit R channel
  asm volatile(YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "ldr q2, [%[kIndices]] \n"
      "1: \n" READYUVP210 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_uv] "+r"(src_uv), // %[src_uv]
        [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [limit] "r"(limit), // %[limit]
        [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
| |
| void P410ToAR30Row_NEON(const uint16_t* src_y, |
| const uint16_t* src_uv, |
| uint8_t* dst_ar30, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| const uvec8* uv_coeff = &yuvconstants->kUVCoeff; |
| const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; |
| uint16_t limit = 0x3ff0; |
| asm volatile(YUVTORGB_SETUP |
| "dup v22.8h, %w[limit] \n" |
| "movi v23.8h, #0xc0, lsl #8 \n" // A |
| "ldr q2, [%[kIndices]] \n" |
| "1: \n" READYUVP410 NVTORGB |
| "subs %w[width], %w[width], #8 \n" STOREAR30 |
| "b.gt 1b \n" |
| : [src_y] "+r"(src_y), // %[src_y] |
| [src_uv] "+r"(src_uv), // %[src_uv] |
| [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] |
| [width] "+r"(width) // %[width] |
| : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] |
| [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] |
| [limit] "r"(limit), // %[limit] |
| [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] |
| : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); |
| } |
| |
// Convert a row of I422 (8-bit planar, half-width chroma) to AR30,
// 8 pixels per loop iteration. width assumed to be a positive multiple of 8.
void I422ToAR30Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp for the 10-bit R channel
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV422 I4XXTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [limit] "r"(limit) // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
| |
// Convert a row of I444 plus a separate 8-bit alpha plane to ARGB,
// 8 pixels per loop iteration. width assumed to be a positive multiple of 8.
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [src_a] "+r"(src_a), // %[src_a]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I410 plus a separate 10-bit alpha plane to ARGB,
// 8 pixels per loop iteration. Alpha is narrowed 10->8 bits with
// saturation (uqshrn #2). width assumed to be a positive multiple of 8.
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [src_a] "+r"(src_a), // %[src_a]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I210 plus a separate 10-bit alpha plane to ARGB,
// 8 pixels per loop iteration. Alpha is narrowed 10->8 bits with
// saturation (uqshrn #2). width assumed to be a positive multiple of 8.
void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [src_a] "+r"(src_a), // %[src_a]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I422 plus a separate 8-bit alpha plane to ARGB,
// 8 pixels per loop iteration. width assumed to be a positive multiple of 8.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [src_a] "+r"(src_a), // %[src_a]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I422 to RGBA (alpha first in memory, st4 of v15..v18),
// 8 pixels per loop iteration. Uses v15 for alpha since v19 would not
// satisfy st4's consecutive-register requirement here.
// width assumed to be a positive multiple of 8.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v15.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v15");
}
| |
// Convert a row of I422 to RGB24 (3 bytes/pixel, no alpha), 8 pixels per
// loop iteration. width assumed to be a positive multiple of 8.
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
| |
// Pack 8-bit B/G/R (in the low byte of v16/v17/v18) into RGB565 in v18.8h.
// shll moves each byte to the high byte of its 16-bit lane, then the two
// sri inserts combine G and B beneath R.
#define ARGBTORGB565 \
  /* Inputs: \
   * v16: bbbbbxxx \
   * v17: ggggggxx \
   * v18: rrrrrxxx */ \
  "shll v18.8h, v18.8b, #8 \n" /* rrrrrrxx00000000 */ \
  "shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \
  "shll v16.8h, v16.8b, #8 \n" /* bbbbbbxx00000000 */ \
  "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */

// Same as ARGBTORGB565 but for channel data already in the top of each
// 16-bit lane (as produced by RGBTORGB8_TOP); skips the shll widening.
#define ARGBTORGB565_FROM_TOP \
  /* Inputs: \
   * v16: bbbbbxxxxxxxxxxx \
   * v17: ggggggxxxxxxxxxx \
   * v18: rrrrrxxxxxxxxxxx */ \
  "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */
| |
// Convert a row of I422 to RGB565 (16 bits/pixel), 8 pixels per loop
// iteration. width assumed to be a positive multiple of 8.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
| |
// Pack ARGB channel pairs into ARGB1555 in v17.8h using shift-right-inserts.
// Clobbers v1/v2 as scratch.
#define ARGBTOARGB1555 \
  /* Inputs: \
   * v16: gggggxxxbbbbbxxx v17: axxxxxxxrrrrrxxx */ \
  "shl v1.8h, v16.8h, #8 \n" /* bbbbbxxx00000000 */ \
  "shl v2.8h, v17.8h, #8 \n" /* rrrrrxxx00000000 */ \
  "sri v17.8h, v2.8h, #1 \n" /* arrrrrxxxrrrrxxx */ \
  "sri v17.8h, v16.8h, #6 \n" /* arrrrrgggggxxxbb */ \
  "sri v17.8h, v1.8h, #11 \n" /* arrrrrgggggbbbbb */

// Same packing but for channels already in the top of each 16-bit lane
// (RGBTORGB8_TOP output) with alpha preloaded in the top bit of v19; the sri
// inserts preserve v19's bit 15, so v19 can be reused across iterations.
#define ARGBTOARGB1555_FROM_TOP \
  /* Inputs: \
   * v16: bbbbbxxxxxxxxxxx v17: gggggxxxxxxxxxxx \
   * v18: rrrrrxxxxxxxxxxx v19: axxxxxxxxxxxxxxx */ \
  "sri v19.8h, v18.8h, #1 \n" /* arrrrrxxxxxxxxxx */ \
  "sri v19.8h, v17.8h, #6 \n" /* arrrrrgggggxxxxx */ \
  "sri v19.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
| |
// Convert a row of I422 to ARGB1555 (16 bits/pixel, alpha = 1), 8 pixels
// per loop iteration. v19 is seeded with the alpha bit (0x8000 per lane)
// once; ARGBTOARGB1555_FROM_TOP preserves that bit each iteration.
// width assumed to be a positive multiple of 8.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8h, #0x80, lsl #8 \n"
      "1: \n" //
      READYUV422 I4XXTORGB RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" //
      ARGBTOARGB1555_FROM_TOP
      "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Pack 8-bit B/G/R/A into ARGB4444: sri merges the top nibbles pairwise
// (B into G, R into A), then zip1 interleaves the BG and RA bytes into v0.
// Clobbers v17 and v19 (in-place merges).
#define ARGBTOARGB4444 \
  /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \
  "sri v17.8b, v16.8b, #4 \n" /* BG */ \
  "sri v19.8b, v18.8b, #4 \n" /* RA */ \
  "zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */
| |
// Convert a row of I422 to ARGB4444 (16 bits/pixel, alpha = 0xf), 8 pixels
// per loop iteration. Alpha (v19) is re-initialized inside the loop because
// ARGBTOARGB4444 clobbers it. width assumed to be a positive multiple of 8.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "movi v19.8b, #255 \n" ARGBTOARGB4444
      "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
                                               // pixels
                                               // ARGB4444.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [src_u] "+r"(src_u), // %[src_u]
        [src_v] "+r"(src_v), // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
// Convert a row of I400 (luma only) to ARGB with alpha = 255, 8 pixels per
// loop iteration. The chroma contributions DB/DG/DR for fixed U=V=128 are
// precomputed in v4/v5/v6 before the loop, as required by I400TORGB.
// width assumed to be a positive multiple of 8.
void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v1.16b, #128 \n"
      "movi v19.8b, #255 \n"
      "umull v6.8h, v1.8b, v30.8b \n"
      "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */
      "umull v4.8h, v1.8b, v28.8b \n" /* DB */
      "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */
      "1: \n" READYUV400 I400TORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y), // %[src_y]
        [dst_argb] "+r"(dst_argb), // %[dst_argb]
        [width] "+r"(width) // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
| |
#if defined(LIBYUV_USE_ST4)
// Expand a row of J400 (full-range luma) to ARGB by replicating Y into
// B/G/R and setting alpha = 255, 8 pixels per loop iteration, using st4
// (preferred on CPUs where STn beats ZIP1+ST1 -- see note at top of file).
// width assumed to be a positive multiple of 8.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile (
      "movi v23.8b, #255 \n"
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mov v21.8b, v20.8b \n"
      "mov v22.8b, v20.8b \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y), // %0
        "+r"(dst_argb), // %1
        "+r"(width) // %2
      :
      : "cc", "memory", "v20", "v21", "v22", "v23");
}
#else
// Same conversion using zip1/zip2 to interleave YYYA in registers followed
// by a plain pair store (faster than st4 on e.g. Exynos M1-M3).
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile (
      "movi v20.8b, #255 \n"
      "1: \n"
      "ldr d16, [%0], #8 \n"
      "subs %w2, %w2, #8 \n"
      "zip1 v18.16b, v16.16b, v16.16b \n" // YY
      "zip1 v19.16b, v16.16b, v20.16b \n" // YA
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
      "zip2 v17.16b, v18.16b, v19.16b \n"
      "stp q16, q17, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y), // %0
        "+r"(dst_argb), // %1
        "+r"(width) // %2
      :
      : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
}
#endif // LIBYUV_USE_ST4
| |
// NV12 (Y plane + interleaved UV plane) to ARGB, 8 pixels per loop.
// v2 holds the kNV12Table permutation used by READNV12 to split the
// interleaved UV bytes. Alpha is forced to 255 in v19.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"  // alpha = 255
      "ldr q2, [%[kNV12Table]] \n"  // UV deinterleave permutation
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_uv] "+r"(src_uv),  // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
| |
// NV21 (Y plane + interleaved VU plane) to ARGB, 8 pixels per loop.
// Reuses the NV12 code path: binding kNV21Table to the [kNV12Table]
// operand swaps the U/V byte ordering inside READNV12.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"  // alpha = 255
      "ldr q2, [%[kNV12Table]] \n"  // VU deinterleave permutation
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_uv] "+r"(src_vu),  // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
| |
// NV12 to 24-bit RGB (no alpha channel), 8 pixels per loop.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"  // UV deinterleave permutation
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"  // 8 RGB24 pixels
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_uv] "+r"(src_uv),  // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
| |
// NV21 to 24-bit RGB (no alpha channel), 8 pixels per loop.
// Same body as NV12ToRGB24Row_NEON with kNV21Table to swap U/V order.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"  // VU deinterleave permutation
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"  // 8 RGB24 pixels
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_uv] "+r"(src_vu),  // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
| |
// NV12 to RGB565 (16-bit packed), 8 pixels per loop.
// RGBTORGB8_TOP leaves the 8-bit channels in the top half of each lane so
// ARGBTORGB565_FROM_TOP can pack 5:6:5 without extra shifts.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"  // UV deinterleave permutation
      "1: \n" READNV12 NVTORGB
      RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
                                              // pixels
                                              // RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_uv] "+r"(src_uv),  // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),  // %[dst_rgb565]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
| |
// YUY2 (packed Y0 U Y1 V) to ARGB, 8 pixels per loop.
// v2 holds kNV21InterleavedTable used by READYUY2 to separate Y from UV.
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"  // alpha = 255
      "ldr q2, [%[kNV21InterleavedTable]] \n"
      "1: \n" READYUY2 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_yuy2] "+r"(src_yuy2),  // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV21InterleavedTable] "r"(&kNV21InterleavedTable)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
| |
// UYVY (packed U Y0 V Y1) to ARGB, 8 pixels per loop.
// v2 holds kNV12InterleavedTable used by READUYVY to separate Y from UV.
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"  // alpha = 255
      "ldr q2, [%[kNV12InterleavedTable]] \n"
      "1: \n" READUYVY NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_uyvy] "+r"(src_uyvy),  // %[src_uyvy]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)  // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),  // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12InterleavedTable] "r"(&kNV12InterleavedTable)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
| |
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// LD2 deinterleaves on load: v0 collects the U bytes, v1 the V bytes.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile (
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
      "subs %w3, %w3, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%1], #16 \n" // store U
      "st1 {v1.16b}, [%2], #16 \n" // store V
      "b.gt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(width)  // %3 // Output registers
      :  // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// Reads 16 byte Y's from tile and writes out 16 Y's.
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
// width measured in bytes so 8 UV = 16.
// Each load advances src by a whole tile stride, gathering one 16-byte
// row from each successive tile into a linear destination row.
void DetileRow_NEON(const uint8_t* src,
                    ptrdiff_t src_tile_stride,
                    uint8_t* dst,
                    int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes
      "subs %w2, %w2, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead
      "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes
      "b.gt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "r"(src_tile_stride)  // %3
      : "cc", "memory", "v0"  // Clobber List
  );
}
| |
// Reads 16 Y's of 16 bits from tile and writes out 16 Y's.
// The byte stride passed to the asm is src_tile_stride * 2 because the
// caller's stride is in (16-bit) elements.
void DetileRow_16_NEON(const uint16_t* src,
                       ptrdiff_t src_tile_stride,
                       uint16_t* dst,
                       int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
      "subs %w2, %w2, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
      "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
      "b.gt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "r"(src_tile_stride * 2)  // %3
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
// LD2 deinterleaves U (v0) and V (v1) while advancing src by one tile
// stride per iteration. width counts source bytes (16 per loop).
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                           ptrdiff_t src_tile_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile (
      "1: \n"
      "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%0, 1792] \n"
      "st1 {v0.8b}, [%1], #8 \n"
      "st1 {v1.8b}, [%2], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(width)  // %3
      : "r"(src_tile_stride)  // %4
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
| #if defined(LIBYUV_USE_ST2) |
// Read 16 Y, 8 UV, and write 8 YUY2
// ST2 variant: interleaves Y (v0) with UV (v1) on store.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
      "prfm pldl1keep, [%0, 1792] \n"
      "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
      "prfm pldl1keep, [%1, 1792] \n"
      "subs %w3, %w3, #16 \n" // store 8 YUY2
      "st2 {v0.16b,v1.16b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),  // %0
        "+r"(src_uv),  // %1
        "+r"(dst_yuy2),  // %2
        "+r"(width)  // %3
      : "r"(src_y_tile_stride),  // %4
        "r"(src_uv_tile_stride)  // %5
      : "cc", "memory", "v0", "v1"  // Clobber list
  );
}
| #else |
// Read 16 Y, 8 UV, and write 8 YUY2
// ZIP variant for CPUs where ST2 is slow: interleave in registers
// (zip1/zip2) then store linearly.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
      "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%0, 1792] \n"
      "zip1 v2.16b, v0.16b, v1.16b \n"  // low half: Y/UV interleaved
      "prfm pldl1keep, [%1, 1792] \n"
      "zip2 v3.16b, v0.16b, v1.16b \n"  // high half
      "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
      "b.gt 1b \n"
      : "+r"(src_y),  // %0
        "+r"(src_uv),  // %1
        "+r"(dst_yuy2),  // %2
        "+r"(width)  // %3
      : "r"(src_y_tile_stride),  // %4
        "r"(src_uv_tile_stride)  // %5
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber list
  );
}
| #endif |
| |
| // Unpack MT2T into tiled P010 64 pixels at a time. See |
| // tinyurl.com/mtk-10bit-video-format for format documentation. |
| void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { |
| asm volatile ( |
| "1: \n" |
| "ld1 {v7.16b}, [%0], #16 \n" |
| "ld1 {v0.16b-v3.16b}, [%0], #64 \n" |
| "shl v4.16b, v7.16b, #6 \n" |
| "shl v5.16b, v7.16b, #4 \n" |
| "shl v6.16b, v7.16b, #2 \n" |
| "subs %2, %2, #80 \n" |
| "zip1 v16.16b, v4.16b, v0.16b \n" |
| "zip1 v18.16b, v5.16b, v1.16b \n" |
| "zip1 v20.16b, v6.16b, v2.16b \n" |
| "zip1 v22.16b, v7.16b, v3.16b \n" |
| "zip2 v17.16b, v4.16b, v0.16b \n" |
| "zip2 v19.16b, v5.16b, v1.16b \n" |
| "zip2 v21.16b, v6.16b, v2.16b \n" |
| "zip2 v23.16b, v7.16b, v3.16b \n" |
| "sri v16.8h, v16.8h, #10 \n" |
| "sri v17.8h, v17.8h, #10 \n" |
| "sri v18.8h, v18.8h, #10 \n" |
| "sri v19.8h, v19.8h, #10 \n" |
| "st1 {v16.8h-v19.8h}, [%1], #64 \n" |
| "sri v20.8h, v20.8h, #10 \n" |
| "sri v21.8h, v21.8h, #10 \n" |
| "sri v22.8h, v22.8h, #10 \n" |
| "sri v23.8h, v23.8h, #10 \n" |
| "st1 {v20.8h-v23.8h}, [%1], #64 \n" |
| "b.gt 1b \n" |
| : "+r"(src), // %0 |
| "+r"(dst), // %1 |
| "+r"(size) // %2 |
| : |
| : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); |
| } |
| |
| #if defined(LIBYUV_USE_ST2) |
// Reads 16 U's and V's and writes out 16 pairs of UV.
// ST2 variant: interleaving happens in the store.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n" // load U
      "ld1 {v1.16b}, [%1], #16 \n" // load V
      "subs %w3, %w3, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),  // %0
        "+r"(src_v),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3 // Output registers
      :  // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// Merge 8 16-bit U and V values per loop into interleaved UV, scaling
// values of `depth` significant bits up to 16 bits (left shift by
// 16 - depth, held in v2). ST2 variant.
void MergeUVRow_16_NEON(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int depth,
                        int width) {
  int shift = 16 - depth;
  asm volatile (
      "dup v2.8h, %w4 \n"  // broadcast shift amount
      "1: \n"
      "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
      "subs %w3, %w3, #8 \n" // 8 src pixels per loop
      "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
      "ushl v0.8h, v0.8h, v2.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "ushl v1.8h, v1.8h, v2.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
      "b.gt 1b \n"
      : "+r"(src_u),  // %0
        "+r"(src_v),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3
      : "r"(shift)  // %4
      : "cc", "memory", "v0", "v1", "v2");
}
| #else |
// Reads 16 U's and V's and writes out 16 pairs of UV.
// ZIP variant for CPUs where ST2 is slow: interleave in registers then
// store linearly.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n" // load U
      "ld1 {v1.16b}, [%1], #16 \n" // load V
      "subs %w3, %w3, #16 \n" // 16 processed per loop
      "zip1 v2.16b, v0.16b, v1.16b \n"  // low 8 UV pairs
      "prfm pldl1keep, [%0, 448] \n"
      "zip2 v3.16b, v0.16b, v1.16b \n"  // high 8 UV pairs
      "prfm pldl1keep, [%1, 448] \n"
      "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),  // %0
        "+r"(src_v),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3 // Output registers
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
| void MergeUVRow_16_NEON(const uint16_t* src_u, |
| const uint16_t* src_v, |
| uint16_t* dst_uv, |
| int depth, |
| int width) { |
| int shift = 16 - depth; |
| asm volatile ( |
| "dup v4.8h, %w4 \n" |
| "1: \n" |
| "ld1 {v0.8h}, [%0], #16 \n" // load 8 U |
| "subs %w3, %w3, #8 \n" // 8 src pixels per loop |
| "ld1 {v1.8h}, [%1], #16 \n" // load 8 V |
| "ushl v0.8h, v0.8h, v4.8h \n" |
| "ushl v1.8h, v1.8h, v4.8h \n" |
| "prfm pldl1keep, [%0, 448] \n" |
| "zip1 v2.8h, v0.8h, v1.8h \n" |
| "zip2 v3.8h, v0.8h, v1.8h \n" |
| "prfm pldl1keep, [%1, 448] \n" |
| "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels |
| "b.gt 1b \n" |
| : "+r"(src_u), // %0 |
| "+r"(src_v), // %1 |
| "+r"(dst_uv), // %2 |
| "+r"(width) // %3 |
| : "r"(shift) // %4 |
| : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4"); |
| } |
| #endif // LIBYUV_USE_ST2 |
| |
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
// LD3 deinterleaves the 3-byte pixels into one register per channel.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile (
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
      "subs %w4, %w4, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%1], #16 \n" // store R
      "st1 {v1.16b}, [%2], #16 \n" // store G
      "st1 {v2.16b}, [%3], #16 \n" // store B
      "b.gt 1b \n"
      : "+r"(src_rgb),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2
        "+r"(dst_b),  // %3
        "+r"(width)  // %4
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
| |
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
// ST3 interleaves the three channel registers on store.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n" // load R
      "ld1 {v1.16b}, [%1], #16 \n" // load G
      "ld1 {v2.16b}, [%2], #16 \n" // load B
      "subs %w4, %w4, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)  // %4
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
| |
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
// ARGB memory byte order is B,G,R,A, so LD4 yields B in v0, G in v1,
// R in v2, A in v3; the stores route each to the matching plane.
void SplitARGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
      "subs %w5, %w5, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%3], #16 \n" // store B
      "st1 {v1.16b}, [%2], #16 \n" // store G
      "st1 {v2.16b}, [%1], #16 \n" // store R
      "st1 {v3.16b}, [%4], #16 \n" // store A
      "b.gt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2
        "+r"(dst_b),  // %3
        "+r"(dst_a),  // %4
        "+r"(width)  // %5
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
| #if defined(LIBYUV_USE_ST4) |
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// ST4 variant; channels are loaded in B,G,R,A order (v0-v3) to match
// ARGB little-endian memory layout.
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%2], #16 \n" // load B
      "ld1 {v1.16b}, [%1], #16 \n" // load G
      "ld1 {v2.16b}, [%0], #16 \n" // load R
      "ld1 {v3.16b}, [%3], #16 \n" // load A
      "subs %w5, %w5, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "prfm pldl1keep, [%3, 448] \n"
      "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(src_a),  // %3
        "+r"(dst_argb),  // %4
        "+r"(width)  // %5
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| #else |
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// ZIP variant for CPUs where ST4 is slow: two rounds of zips build the
// B,G,R,A byte interleave in registers, then one linear 64-byte store.
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%2], #16 \n" // load B
      "ld1 {v1.16b}, [%1], #16 \n" // load G
      "ld1 {v2.16b}, [%0], #16 \n" // load R
      "ld1 {v3.16b}, [%3], #16 \n" // load A
      "subs %w5, %w5, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%2, 448] \n"
      "zip1 v4.16b, v0.16b, v1.16b \n" // BG
      "zip1 v5.16b, v2.16b, v3.16b \n" // RA
      "prfm pldl1keep, [%1, 448] \n"
      "zip2 v6.16b, v0.16b, v1.16b \n" // BG
      "zip2 v7.16b, v2.16b, v3.16b \n" // RA
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
      "zip2 v1.8h, v4.8h, v5.8h \n"
      "prfm pldl1keep, [%3, 448] \n"
      "zip1 v2.8h, v6.8h, v7.8h \n"
      "zip2 v3.8h, v6.8h, v7.8h \n"
      "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(src_a),  // %3
        "+r"(dst_argb),  // %4
        "+r"(width)  // %5
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7"  // Clobber List
  );
}
| #endif // LIBYUV_USE_ST4 |
| |
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
// Same as SplitARGBRow_NEON but the alpha channel (v3) is discarded.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
      "subs %w4, %w4, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%3], #16 \n" // store B
      "st1 {v1.16b}, [%2], #16 \n" // store G
      "st1 {v2.16b}, [%1], #16 \n" // store R
      "b.gt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2
        "+r"(dst_b),  // %3
        "+r"(width)  // %4
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
// Alpha is forced to 255 (v3, set once outside the loop).
void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile (
      "movi v3.16b, #255 \n" // load A(255)
      "1: \n"
      "ld1 {v2.16b}, [%0], #16 \n" // load R
      "ld1 {v1.16b}, [%1], #16 \n" // load G
      "ld1 {v0.16b}, [%2], #16 \n" // load B
      "subs %w4, %w4, #16 \n" // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(dst_argb),  // %3
        "+r"(width)  // %4
      :  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Merge planar 16-bit R, G, B (values of `depth` significant bits) into
// packed AR30 (2:10:10:10), 4 pixels per loop. Channels are widened to
// 32 bits, shifted to 10-bit range (v31 = 10 - depth), clamped to 1023
// (v30), packed with SLI inserts, and the alpha field forced to 0b11.
void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;
  asm volatile (
      "movi v30.16b, #255 \n"
      "ushr v30.4s, v30.4s, #22 \n" // 1023
      "dup v31.4s, %w5 \n"  // broadcast shift amount
      "1: \n"
      "ldr d2, [%2], #8 \n" // B
      "ldr d1, [%1], #8 \n" // G
      "ldr d0, [%0], #8 \n" // R
      "ushll v2.4s, v2.4h, #0 \n" // B
      "ushll v1.4s, v1.4h, #0 \n" // G
      "ushll v0.4s, v0.4h, #0 \n" // R
      "ushl v2.4s, v2.4s, v31.4s \n" // 000B
      "ushl v1.4s, v1.4s, v31.4s \n" // G
      "ushl v0.4s, v0.4s, v31.4s \n" // R
      "umin v2.4s, v2.4s, v30.4s \n"
      "umin v1.4s, v1.4s, v30.4s \n"
      "umin v0.4s, v0.4s, v30.4s \n"
      "sli v2.4s, v1.4s, #10 \n" // 00GB
      "sli v2.4s, v0.4s, #20 \n" // 0RGB
      "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
      "subs %w4, %w4, #4 \n"  // 4 pixels per iteration
      "str q2, [%3], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)  // %4
      : "r"(shift)  // %5
      : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
}
| |
// Merge planar 10-bit R, G, B into packed AR30, 8 pixels per loop.
// Specialization for depth == 10 (the `depth` parameter is ignored);
// channels are clamped to 1023 and assembled in 16-bit lanes, then
// interleaved by the ST2.
void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  // Neon has no "shift left and accumulate/orr", so use a multiply-add to
  // perform the shift instead.
  int limit = 1023;
  asm volatile (
      "dup v5.8h, %w[limit] \n"  // 1023 clamp
      "movi v6.8h, #16 \n" // 1 << 4
      "movi v7.8h, #4, lsl #8 \n" // 1 << 10
      "1: \n"
      "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr
      "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg
      "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb
      "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr
      "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg
      "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000
      "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb
      "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000
      "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb
      "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg
      "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
      "st2 {v3.8h, v4.8h}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)  // %4
      : [limit] "r"(limit)
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
| |
| void MergeAR64Row_NEON(const uint16_t* src_r, |
| const uint16_t* src_g, |
| const uint16_t* src_b, |
| const uint16_t* src_a, |
| uint16_t* dst_ar64, |
| int depth, |
| int width) { |
| int shift = 16 - depth; |
| int mask = (1 << depth) - 1; |
| asm volatile ( |
| |
| "dup v30.8h, %w7 \n" |
| "dup v31.8h, %w6 \n" |
| "1: \n" |
| "ldr q2, [%0], #16 \n" // R |
| "ldr q1, [%1], #16 \n" // G |
| "ldr q0, [%2], #16 \n" // B |
| "ldr q3, [%3], #16 \n" // A |
| "umin v2.8h, v2.8h, v30.8h \n" |
| "prfm pldl1keep, [%0, 448] \n" |
| "umin v1.8h, v1.8h, v30.8h \n" |
| "prfm pldl1keep, [%1, 448] \n" |
| "umin v0.8h, v0.8h, v30.8h \n" |
| "prfm pldl1keep, [%2, 448] \n" |
| "umin v3.8h, v3.8h, v30.8h \n" |
| "prfm pldl1keep, [%3, 448] \n" |
| "ushl v2.8h, v2.8h, v31.8h \n" |
| "ushl v1.8h, v1.8h, v31.8h \n" |
| "ushl v0.8h, v0.8h, v31.8h \n" |
| "ushl v3.8h, v3.8h, v31.8h \n" |
| "subs %w5, %w5, #8 \n" |
| "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" |
| "b.gt 1b \n" |
| : "+r"(src_r), // %0 |
| "+r"(src_g), // %1 |
| "+r"(src_b), // %2 |
| "+r"(src_a), // %3 |
| "+r"(dst_ar64), // %4 |
| "+r"(width) // %5 |
| : "r"(shift), // %6 |
| "r"(mask) // %7 |
| : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); |
| } |
| |
| void MergeXR64Row_NEON(const uint16_t* src_r, |
| const uint16_t* src_g, |
| const uint16_t* src_b, |
| uint16_t* dst_ar64, |
| int depth, |
| int width) { |
| int shift = 16 - depth; |
| int mask = (1 << depth) - 1; |
| asm volatile ( |
| |
| "movi v3.16b, #0xff \n" // A (0xffff) |
| "dup v30.8h, %w6 \n" |
| "dup v31.8h, %w5 \n" |
| |
| "1: \n" |
| "ldr q2, [%0], #16 \n" // R |
| "ldr q1, [%1], #16 \n" // G |
| "ldr q0, [%2], #16 \n" // B |
| "umin v2.8h, v2.8h, v30.8h \n" |
| "prfm pldl1keep, [%0, 448] \n" |
| "umin v1.8h, v1.8h, v30.8h \n" |
| "prfm pldl1keep, [%1, 448] \n" |
| "umin v0.8h, v0.8h, v30.8h \n" |
| "prfm pldl1keep, [%2, 448] \n" |
| "ushl v2.8h, v2.8h, v31.8h \n" |
| "ushl v1.8h, v1.8h, v31.8h \n" |
| "ushl v0.8h, v0.8h, v31.8h \n" |
| "subs %w4, %w4, #8 \n" |
| "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" |
| "b.gt 1b \n" |
| : "+r"(src_r), // %0 |
| "+r"(src_g), // %1 |
| "+r"(src_b), // %2 |
| "+r"(dst_ar64), // %3 |
| "+r"(width) // %4 |
| : "r"(shift), // %5 |
| "r"(mask) // %6 |
| : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); |
| } |
| |
// Merge planar 16-bit B, G, R, A (values of `depth` significant bits)
// into packed 8-bit ARGB, 8 pixels per loop. Note the operand binding:
// %0 is src_b, %2 is src_r (reversed relative to the parameter order).
// UQSHL places the 8-bit result in the top byte of each lane; TRN2
// extracts those top bytes and pairs the channels for the ST2.
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
  int shift = 16 - depth;
  asm volatile (
      "dup v31.8h, %w6 \n"  // broadcast shift amount
      "1: \n"
      "ldr q0, [%0], #16 \n" // B
      "ldr q1, [%1], #16 \n" // G
      "ldr q2, [%2], #16 \n" // R
      "ldr q3, [%3], #16 \n" // A
      "uqshl v0.8h, v0.8h, v31.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "uqshl v1.8h, v1.8h, v31.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "uqshl v2.8h, v2.8h, v31.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "uqshl v3.8h, v3.8h, v31.8h \n"
      "prfm pldl1keep, [%3, 448] \n"
      "trn2 v0.16b, v0.16b, v1.16b \n"  // interleave B,G top bytes
      "trn2 v1.16b, v2.16b, v3.16b \n"  // interleave R,A top bytes
      "subs %w5, %w5, #8 \n"
      "st2 {v0.8h, v1.8h}, [%4], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_b),  // %0
        "+r"(src_g),  // %1
        "+r"(src_r),  // %2
        "+r"(src_a),  // %3
        "+r"(dst_argb),  // %4
        "+r"(width)  // %5
      : "r"(shift)  // %6
      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
| |
// Merge planar 16-bit B, G, R (values of `depth` significant bits) into
// packed 8-bit ARGB with alpha forced to 0xff (v3), 8 pixels per loop.
// Operand binding: %0 is src_b, %2 is src_r (reversed vs parameters).
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
  int shift = 16 - depth;
  asm volatile (
      "dup v31.8h, %w5 \n"  // broadcast shift amount
      "movi v3.16b, #0xff \n" // A (0xff)
      "1: \n"
      "ldr q0, [%0], #16 \n" // B
      "ldr q1, [%1], #16 \n" // G
      "ldr q2, [%2], #16 \n" // R
      "uqshl v0.8h, v0.8h, v31.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "uqshl v1.8h, v1.8h, v31.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "uqshl v2.8h, v2.8h, v31.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "trn2 v0.16b, v0.16b, v1.16b \n"  // interleave B,G top bytes
      "trn2 v1.16b, v2.16b, v3.16b \n"  // interleave R,A top bytes
      "subs %w4, %w4, #8 \n"
      "st2 {v0.8h, v1.8h}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_b),  // %0
        "+r"(src_g),  // %1
        "+r"(src_r),  // %2
        "+r"(dst_argb),  // %3
        "+r"(width)  // %4
      : "r"(shift)  // %5
      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
| |
// Copy multiple of 32.
// Straight 32-bytes-per-iteration memcpy using LDP/STP of q registers.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile (
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #32 \n" // 32 processed per loop
      "stp q0, q1, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2 // Output registers
      :  // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// SetRow writes 'width' bytes using an 8 bit value repeated.
// The value is broadcast once into v0, then stored 16 bytes at a time.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile (
      "dup v0.16b, %w2 \n" // duplicate 16 bytes
      "1: \n"
      "subs %w1, %w1, #16 \n" // 16 bytes per loop
      "st1 {v0.16b}, [%0], #16 \n" // store
      "b.gt 1b \n"
      : "+r"(dst),  // %0
        "+r"(width)  // %1
      : "r"(v8)  // %2
      : "cc", "memory", "v0");
}
| |
// Writes 'width' ARGB pixels using a 32-bit value repeated.
// The pixel is broadcast once into v0, then stored 4 pixels at a time.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile (
      "dup v0.4s, %w2 \n" // duplicate 4 ints
      "1: \n"
      "subs %w1, %w1, #4 \n" // 4 ints per loop
      "st1 {v0.16b}, [%0], #16 \n" // store
      "b.gt 1b \n"
      : "+r"(dst),  // %0
        "+r"(width)  // %1
      : "r"(v32)  // %2
      : "cc", "memory", "v0");
}
| |
// Shuffle table for reversing the bytes (TBL index vector: 15 down to 0).
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
| |
// Horizontally mirror a row of bytes, 32 per loop iteration.
// Walks src backwards from the end of the row; each 16-byte half is
// byte-reversed with TBL (kShuffleMirror), and the higher-addressed
// half (v2) is stored first so the output is fully reversed.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile (
      // Start at end of source row.
      "ld1 {v3.16b}, [%3] \n" // shuffler
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q2, [%0, 16] \n"
      "ldr q1, [%0], -32 \n" // src -= 32
      "subs %w2, %w2, #32 \n" // 32 pixels per loop.
      "tbl v0.16b, {v2.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
      "b.gt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "r"(&kShuffleMirror)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
| |
// Shuffle table for reversing the UV: reverses 2-byte pairs while
// keeping the U,V byte order within each pair.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
| |
// Horizontally mirror a row of UV pairs, 16 pairs per loop iteration.
// width counts UV pairs (2 bytes each), hence the "sxtw #1" scaling
// when seeking to the end of the row.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile (
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n" // shuffler
      "add %0, %0, %w2, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n" // src -= 32
      "subs %w2, %w2, #16 \n" // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
      "b.gt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(width)  // %2
      : "r"(&kShuffleMirrorUV)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| |
// Horizontally mirror a row of UV pairs and split into separate U and V
// planes, 16 pairs per loop. After the pair-wise mirror (TBL with
// kShuffleMirrorUV), UZP1/UZP2 separate the even (U) and odd (V) bytes.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile (
      // Start at end of source row.
      "ld1 {v4.16b}, [%4] \n" // shuffler
      "add %0, %0, %w3, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n" // src -= 32
      "subs %w3, %w3, #16 \n" // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "uzp1 v0.16b, v2.16b, v3.16b \n" // U
      "uzp2 v1.16b, v2.16b, v3.16b \n" // V
      "st1 {v0.16b}, [%1], #16 \n" // dst += 16
      "st1 {v1.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(width)  // %3
      : "r"(&kShuffleMirrorUV)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| |
// Shuffle table for reversing the ARGB: reverses 4-byte pixels while
// keeping the byte order within each pixel.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
                                         4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
| |
// Mirror a row of ARGB pixels. Walks the source backwards 32 bytes (8 pixels)
// per iteration, reversing pixel order with a TBL shuffle. Assumes width is a
// multiple of 8.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile (
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw #2 \n"  // src += width * 4
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(&kShuffleMirrorARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| |
// Mirror a row of RGB24 pixels. Loads 16 pixels deinterleaved into per-channel
// registers with ld3 (stepping backwards 48 bytes via the post-index), byte-
// reverses each channel with kShuffleMirror, and re-interleaves with st3.
// Assumes width is a multiple of 16.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile (
      "ld1 {v3.16b}, [%4] \n"  // shuffler
      "add %0, %0, %w2, sxtw #1 \n"  // Start at end of row.
      "add %0, %0, %w2, sxtw \n"  // src += width * 3
      "sub %0, %0, #48 \n"

      "1: \n"
      "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "tbl v0.16b, {v0.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "tbl v2.16b, {v2.16b}, v3.16b \n"
      "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "r"((ptrdiff_t)-48),  // %3
        "r"(&kShuffleMirror)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
| |
// Convert a row of RGB24 to ARGB by appending an opaque (255) alpha channel.
// ld3 deinterleaves B,G,R; st4 re-interleaves with the constant alpha.
// Assumes width is a multiple of 8.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile (
      "movi v4.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
                                               // RGB24.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
| |
// Convert a row of RAW (RGB byte order) to ARGB: swaps R and B and appends an
// opaque alpha. Assumes width is a multiple of 8.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile (
      "movi v5.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "mov v3.8b, v1.8b \n"  // move g
      "prfm pldl1keep, [%0, 448] \n"
      "mov v4.8b, v0.8b \n"  // move r
      "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
| |
// Convert a row of RAW (RGB byte order) to RGBA: prepends an opaque alpha and
// swaps R and B. Assumes width is a multiple of 8.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile (
      "movi v0.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "mov v2.8b, v4.8b \n"  // move g
      "prfm pldl1keep, [%0, 448] \n"
      "mov v1.8b, v5.8b \n"  // move r
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
| |
// Convert a row of RAW (RGB byte order) to RGB24 (BGR byte order) by swapping
// the R and B channels. Assumes width is a multiple of 8.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile (
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "mov v3.8b, v1.8b \n"  // move g
      "prfm pldl1keep, [%0, 448] \n"
      "mov v4.8b, v0.8b \n"  // move r
      "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
      "b.gt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
| |
// Expand 16 RGB565 pixels (in v0.8h/v4.8h) to 8-bit B (v0), G (v1), R (v2).
// Each 5/6-bit channel is widened to 8 bits by shifting left and replicating
// the top bits into the low bits with SRI so the full 0-255 range is covered.
#define RGB565TOARGB \
  /* Input: v0/v4.8h: RRRRRGGGGGGBBBBB */ \
  "shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \
  "shrn2 v1.16b, v4.8h, #3 \n" /* G GGGGGGxx */ \
  "uzp2 v2.16b, v0.16b, v4.16b \n" /* R RRRRRxxx */ \
  "uzp1 v0.16b, v0.16b, v4.16b \n" /* B xxxBBBBB */ \
  "sri v1.16b, v1.16b, #6 \n" /* G GGGGGGGG, fill 2 */ \
  "shl v0.16b, v0.16b, #3 \n" /* B BBBBB000 */ \
  "sri v2.16b, v2.16b, #5 \n" /* R RRRRRRRR, fill 3 */ \
  "sri v0.16b, v0.16b, #5 \n" /* R BBBBBBBB, fill 3 */
| |
// Convert a row of RGB565 to ARGB with opaque alpha. Assumes width is a
// multiple of 16.
// NOTE(review): "v6" in the clobber list appears unused by this asm; harmless
// (over-declaring clobbers is safe) but possibly a leftover.
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "movi v3.16b, #255 \n"  // Alpha
      "1: \n"
      "ldp q0, q4, [%0], #32 \n"  // load 16 RGB565 pixels
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
      "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n"  // store 16 ARGB
      "add %1, %1, #64 \n"
      "b.gt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
  );
}
| |
// Expand 8 ARGB1555 pixels (in v0.8h) to 8-bit A (v3), R (v2), G (v1), B (v0).
// The 1-bit alpha is sign-extended to 0x00 or 0xFF; 5-bit channels are widened
// with shift + SRI bit replication. Uses v29 as scratch.
#define ARGB1555TOARGB \
  /* Input: ARRRRRGGGGGBBBBB */ \
  "xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
  "shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \
  "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
  "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
  "shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
  "sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \
  "sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
  "sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
  "sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
| |
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Expands 16 RGB555 pixels (v0.8h/v3.8h) to 8-bit B (v0), G (v1), R (v2),
// using v29 as scratch.
#define RGB555TOARGB \
  /* Input: xRRRRRGGGGGBBBBB */ \
  "uzp1 v29.16b, v0.16b, v3.16b \n" /* xxxBBBBB */ \
  "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
  "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
  "shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
  "shrn2 v2.16b, v3.8h, #7 \n" /* RRRRRxxx */ \
  "shrn2 v1.16b, v3.8h, #2 \n" /* GGGGGxxx */ \
  \
  "sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ \
  "sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
  "sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */
| |
// Convert a row of ARGB1555 to ARGB, expanding the 1-bit alpha to 0/255.
// Assumes width is a multiple of 8.
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v29"  // Clobber List
  );
}
| |
// Expand 8 ARGB4444 pixels (in v1.8h) to interleaved 8-bit B/R (v0) and
// G/A (v1) by replicating each nibble into both halves of a byte.
#define ARGB4444TOARGB \
  /* Input: v1.8h = AAAARRRR_GGGGBBBB */ \
  "shl v0.16b, v1.16b, #4 \n" /* RRRR0000_BBBB0000 */ \
  "sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \
  "sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */
| |
// Expand 16 xRGB4444 pixels (v0.8h/v3.8h) to 8-bit B (v0), G (v1), R (v2),
// ignoring the top nibble. Each 4-bit channel is replicated to 8 bits.
#define ARGB4444TORGB \
  /* Input: v0.8h = xxxxRRRRGGGGBBBB */ \
  "uzp1 v1.16b, v0.16b, v3.16b \n" /* GGGGBBBB */ \
  "shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \
  "shl v0.16b, v1.16b, #4 \n" /* BBBB0000 */ \
  "shrn2 v2.16b, v3.8h, #4 \n" /* RRRRxxxx */ \
  "sri v1.16b, v1.16b, #4 \n" /* GGGGGGGG */ \
  "sri v2.16b, v2.16b, #4 \n" /* RRRRRRRR */ \
  "sri v0.16b, v0.16b, #4 \n" /* BBBBBBBB */
| |
// Convert a row of ARGB4444 to ARGB, replicating each nibble to 8 bits.
// st2 interleaves the B/R and G/A vectors back into BGRA byte order.
// Assumes width is a multiple of 8.
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile (
      "1: \n"
      "ld1 {v1.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
      "st2 {v0.16b, v1.16b}, [%1], #32 \n"  // store 8 ARGB.
      "b.gt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
| |
// Per-lane USHL shift amounts for AR30 conversion: alternating 0 (no shift)
// and -6 (right shift by 6) applied to 16-bit lanes.
static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};

// TBL index tables for ABCDToAR30Row_NEON. Each source byte is doubled into a
// 16-bit lane; the first 16 indices gather the two color channels packed into
// the low 20 bits of AR30, the second 16 gather the remaining channel + alpha.
static const uint8_t kABGRToAR30Row_BoxIndices[] = {
    2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13,
    0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15};
static const uint8_t kARGBToAR30Row_BoxIndices[] = {
    0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13,
    2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};
| |
// ARGB or ABGR as input, reordering based on TBL indices parameter.
// Converts 8 pixels per iteration to AR30 (2:10:10:10). Channels are widened
// from 8 to 10 bits via the TBL byte-doubling plus the USHL/USHR shifts, then
// the two halves are merged with BIF under the 0xfffff mask in v2.
// Assumes width is a multiple of 8.
static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
                               uint8_t* dst_ar30,
                               int width,
                               const uint8_t* indices) {
  asm volatile (
      "movi v2.4s, #0xf, msl 16 \n"  // 0xfffff
      "ldr q3, [%[kAR30Row_BoxShifts]] \n"
      "ldp q4, q5, [%[indices]] \n"
      "1: \n"
      "ldp q0, q20, [%[src]], #32 \n"
      "subs %w[width], %w[width], #8 \n"
      "tbl v1.16b, {v0.16b}, v5.16b \n"
      "tbl v21.16b, {v20.16b}, v5.16b \n"
      "tbl v0.16b, {v0.16b}, v4.16b \n"
      "tbl v20.16b, {v20.16b}, v4.16b \n"
      "ushl v0.8h, v0.8h, v3.8h \n"
      "ushl v20.8h, v20.8h, v3.8h \n"
      "ushl v1.8h, v1.8h, v3.8h \n"
      "ushl v21.8h, v21.8h, v3.8h \n"
      "ushr v0.4s, v0.4s, #6 \n"
      "ushr v20.4s, v20.4s, #6 \n"
      "shl v1.4s, v1.4s, #14 \n"
      "shl v21.4s, v21.4s, #14 \n"
      "bif v0.16b, v1.16b, v2.16b \n"
      "bif v20.16b, v21.16b, v2.16b \n"
      "stp q0, q20, [%[dst]], #32 \n"
      "b.gt 1b \n"
      : [src] "+r"(src_abcd),  // %[src]
        [dst] "+r"(dst_ar30),  // %[dst]
        [width] "+r"(width)    // %[width]
      : [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts),  // %[kAR30Row_BoxShifts]
        [indices] "r"(indices)                         // %[indices]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
}
| |
// Convert a row of ABGR to AR30 via the shared helper with ABGR indices.
void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
  ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);
}
| |
// Convert a row of ARGB to AR30 via the shared helper with ARGB indices.
void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
  ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);
}
| |
// Convert a row of ARGB to RGB24 by dropping the alpha channel.
// Assumes width is a multiple of 16.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "prfm pldl1keep, [%0, 448] \n"
      "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n"  // store 16 RGB24
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Convert a row of ARGB to RAW (RGB byte order): drops alpha and swaps
// R and B. Assumes width is a multiple of 8.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile (
      "1: \n"
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "mov v4.8b, v2.8b \n"  // mov g
      "prfm pldl1keep, [%0, 448] \n"
      "mov v5.8b, v1.8b \n"  // mov b
      "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
| |
// Extract the Y plane from a row of YUY2 (Y0 U Y1 V). ld2 deinterleaves so
// v0 holds the even bytes, which are the Y samples. Assumes width is a
// multiple of 16.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile (
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of YUY2.
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// Extract the Y plane from a row of UYVY (U Y0 V Y1). ld2 deinterleaves so
// v1 holds the odd bytes, which are the Y samples. Assumes width is a
// multiple of 16.
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile (
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of UYVY.
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v1.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
| |
// Split a row of YUY2 into U and V planes (422 chroma: one U,V per 2 pixels).
// ld4 deinterleaves [Y0 U Y1 V] so v1=U and v3=V. Assumes width is a
// multiple of 16.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v1.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Split a row of UYVY into U and V planes (422 chroma: one U,V per 2 pixels).
// ld4 deinterleaves [U Y0 V Y1] so v0=U and v2=V. Assumes width is a
// multiple of 16.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Extract U and V from two rows of YUY2, averaging vertically (420 chroma).
// urhadd gives a rounded average of the current and next row's chroma.
// Assumes width is a multiple of 16.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;  // next row
  asm volatile (
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v1.8b, v1.8b, v5.8b \n"  // average rows of U
      "prfm pldl1keep, [%0, 448] \n"
      "urhadd v3.8b, v3.8b, v7.8b \n"  // average rows of V
      "st1 {v1.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7"  // Clobber List
  );
}
| |
// Extract U and V from two rows of UYVY, averaging vertically (420 chroma).
// urhadd gives a rounded average of the current and next row's chroma.
// Assumes width is a multiple of 16.
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;  // next row
  asm volatile (
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v0.8b, v0.8b, v4.8b \n"  // average rows of U
      "prfm pldl1keep, [%0, 448] \n"
      "urhadd v2.8b, v2.8b, v6.8b \n"  // average rows of V
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),   // %0
        "+r"(src_uyvyb),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7"  // Clobber List
  );
}
| |
// Extract interleaved UV from two rows of YUY2, averaging vertically, for
// NV12-style chroma output. Assumes width is a multiple of 16.
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;  // next row
  asm volatile (
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "ld2 {v2.16b,v3.16b}, [%1], #32 \n"  // load next row
      "urhadd v4.16b, v1.16b, v3.16b \n"  // average rows of UV
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v4.16b}, [%2], #16 \n"  // store 8 UV.
      "b.gt 1b \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_uv),     // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
| |
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each 4-byte pixel using a caller-supplied 16-byte
// TBL shuffle table. Assumes width is a multiple of 4.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile (
      "ld1 {v2.16b}, [%3] \n"  // shuffler
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 4 pixels.
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "tbl v1.16b, {v0.16b}, v2.16b \n"  // look up 4 pixels
      "st1 {v1.16b}, [%1], #16 \n"  // store 4.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
| |
// Interleave planar I422 (Y, U, V) into packed YUY2 (Y0 U Y1 V).
// ld2 splits the Y samples into even (v0) and odd (v2 via the mov); st4 then
// emits Y,U,Y,V. Assumes width is a multiple of 16.
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile (
      "1: \n"
      "ld2 {v0.8b, v1.8b}, [%0], #16 \n"  // load 16 Ys
      "subs %w4, %w4, #16 \n"  // 16 pixels
      "mov v2.8b, v1.8b \n"  // odd Ys
      "prfm pldl1keep, [%0, 448] \n"
      "ld1 {v1.8b}, [%1], #8 \n"  // load 8 Us
      "ld1 {v3.8b}, [%2], #8 \n"  // load 8 Vs
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
| |
// Interleave planar I422 (Y, U, V) into packed UYVY (U Y0 V Y1).
// ld2 splits the Y samples into even (v1) and odd (v3 via the mov); st4 then
// emits U,Y,V,Y. Assumes width is a multiple of 16.
void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile (
      "1: \n"
      "ld2 {v1.8b,v2.8b}, [%0], #16 \n"  // load 16 Ys
      "mov v3.8b, v2.8b \n"  // odd Ys
      "prfm pldl1keep, [%0, 448] \n"
      "ld1 {v0.8b}, [%1], #8 \n"  // load 8 Us
      "ld1 {v2.8b}, [%2], #8 \n"  // load 8 Vs
      "subs %w4, %w4, #16 \n"  // 16 pixels
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
| |
// Convert a row of ARGB to RGB565 using the ARGBTORGB565 packing macro
// (defined earlier in this file). Assumes width is a multiple of 8.
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile (
      "1: \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8
                                                         // pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
      "st1 {v18.16b}, [%1], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v16", "v17", "v18", "v19");
}
| |
// Convert a row of ARGB to RGB565, adding a 4-byte dither pattern (replicated
// across pixels, saturating add) to each channel before truncation.
// Assumes width is a multiple of 8.
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                uint32_t dither4,
                                int width) {
  asm volatile (
      "dup v1.4s, %w3 \n"  // dither4
      "1: \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "uqadd v16.8b, v16.8b, v1.8b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "uqadd v17.8b, v17.8b, v1.8b \n"
      "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
      "st1 {v18.16b}, [%1], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_rgb),   // %1
        "+r"(width)      // %2
      : "r"(dither4)     // %3
      : "cc", "memory", "v1", "v16", "v17", "v18", "v19");
}
| |
// Convert a row of ARGB to ARGB1555 using the ARGBTOARGB1555 packing macro
// (defined earlier in this file). Assumes width is a multiple of 8.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1: \n"
      "ld2 {v16.8h,v17.8h}, [%0], #32 \n"  // load 8 pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
      "st1 {v17.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v1", "v2", "v16", "v17");
}
| |
// Convert a row of ARGB to ARGB4444 using the ARGBTOARGB4444 packing macro
// (defined earlier in this file). Assumes width is a multiple of 8.
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile (
      "1: \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8
                                                         // pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19");
}
| |
| #if defined(LIBYUV_USE_ST2) |
// Convert a row of 8-bit ARGB to 16-bit AR64 by duplicating each byte into
// both halves of a 16-bit lane (x * 257 scaling). ST2 variant.
// Assumes width is a multiple of 8.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile (
      "1: \n"
      "ldp q0, q2, [%0], #32 \n"  // load 8 pixels
      "mov v1.16b, v0.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mov v3.16b, v2.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st2 {v0.16b, v1.16b}, [%1], #32 \n"  // store 4 pixels
      "st2 {v2.16b, v3.16b}, [%1], #32 \n"  // store 4 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
| |
// TBL indices swapping R and B within each 4-byte pixel (ARGB <-> ABGR).
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};
| |
// Convert a row of 8-bit ARGB to 16-bit AB64: swaps R and B via TBL, then
// duplicates each byte into both halves of a 16-bit lane. ST2 variant.
// Assumes width is a multiple of 8.
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile (
      "ldr q4, [%3] \n"  // shuffler
      "1: \n"
      "ldp q0, q2, [%0], #32 \n"  // load 8 pixels
      "tbl v0.16b, {v0.16b}, v4.16b \n"
      "tbl v2.16b, {v2.16b}, v4.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mov v1.16b, v0.16b \n"
      "mov v3.16b, v2.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st2 {v0.16b, v1.16b}, [%1], #32 \n"  // store 4 pixels
      "st2 {v2.16b, v3.16b}, [%1], #32 \n"  // store 4 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ab64),  // %1
        "+r"(width)      // %2
      : "r"(&kShuffleARGBToABGR)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| #else |
// Convert a row of 8-bit ARGB to 16-bit AR64 by duplicating each byte into
// both halves of a 16-bit lane (x * 257 scaling) using ZIP1/ZIP2.
// Assumes width is a multiple of 8.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile (
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"  // load 8 ARGB pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "zip1 v2.16b, v0.16b, v0.16b \n"
      "zip2 v3.16b, v0.16b, v0.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v4.16b, v1.16b, v1.16b \n"
      "zip2 v5.16b, v1.16b, v1.16b \n"
      "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n"  // 8 AR64
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
| |
// TBL indices converting 8-bit ARGB to 16-bit AB64 in one step: each source
// byte index appears twice (byte-doubling to 16 bits) with R and B swapped.
static const uvec8 kShuffleARGBToAB64[2] = {
    {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
    {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
| |
| void ARGBToAB64Row_NEON(const uint8_t* src_argb, |
| uint16_t* dst_ab64, |
| int width) { |
| asm volatile ( |
| "ldp q6, q7, [%3] \n" // 2 shufflers |
| "1: \n" |
| "ldp q0, q1, [%0], #32 \n" // load 8 pixels |
| "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 |
| "tbl v3.16b, {v0.16b}, v7.16b \n" |
| "prfm pldl1keep, [%0, 448] \n" |
| "tbl v4.16b, {v1.16b}, v6.16b \n" |
| "tbl v5.16b, {v1.16b}, v7.16b \n" |
| "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 |
| "b.gt 1b \n" |
| : "+r"(src_argb), // %0 |
| "+r"(dst_ab64), // %1 |
| "+r"(width) // %2 |
| : "r"(&kShuffleARGBToAB64[0]) // %3 |
| : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); |
| } |
| #endif // LIBYUV_USE_ST2 |
| |
// Two-register TBL indices taking the high byte of each 16-bit AR64 lane
// (indices 16-31 select from the second source register).
static const uvec8 kShuffleAR64ToARGB = {1,  3,  5,  7,  9,  11, 13, 15,
                                         17, 19, 21, 23, 25, 27, 29, 31};
| |
// Convert a row of 16-bit AR64 to 8-bit ARGB by taking the high byte of each
// channel via a two-register TBL. Assumes width is a multiple of 8.
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile (
      "ldr q4, [%3] \n"  // shuffler
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"  // load 4 pixels
      "ldp q2, q3, [%0], #32 \n"  // load 4 pixels
      "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "stp q0, q2, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(&kShuffleAR64ToARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| |
// Two-register TBL indices taking the high byte of each 16-bit AB64 lane
// while also swapping R and B back to ARGB order.
static const uvec8 kShuffleAB64ToARGB = {5,  3,  1,  7,  13, 11, 9,  15,
                                         21, 19, 17, 23, 29, 27, 25, 31};
| |
// Convert a row of 16-bit AB64 to 8-bit ARGB: takes the high byte of each
// channel and swaps R and B via a two-register TBL. Assumes width is a
// multiple of 8.
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile (
      "ldr q4, [%3] \n"  // shuffler
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"  // load 4 pixels
      "ldp q2, q3, [%0], #32 \n"  // load 4 pixels
      "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "stp q0, q2, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_ab64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(&kShuffleAB64ToARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
| |
// Extract the alpha channel from a row of ARGB into a planar A buffer.
// Assumes width is a multiple of 16.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile (
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v3.16b}, [%1], #16 \n"  // store 16 A's.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
| |
// Unsigned 8-bit RGB->U and RGB->V coefficient sets, one 4-byte vector each
// (4th entry available for padding/rounding use by the consuming kernels).
struct RgbUVConstantsU8 {
  uint8_t kRGBToU[4];
  uint8_t kRGBToV[4];
};
| |
// Signed 8-bit variant of RgbUVConstantsU8 for kernels using signed multiply.
struct RgbUVConstantsI8 {
  int8_t kRGBToU[4];
  int8_t kRGBToV[4];
};
| |
| // 8x1 pixels. |
| static void ARGBToUV444MatrixRow_NEON( |
| const uint8_t* src_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width, |
| const struct RgbUVConstantsU8* rgbuvconsta
|