// blob: 6d50b27600421b5cb031fd43782d1056922127df [file] [log] [blame]
/*
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
// STn over ZIP1+ST1
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Register layout produced by the READ* macros below:
// v0.8h: Y
// v1.16b: 8U, 8V

// Read 8 Y, 4 U and 4 V from 422.
// zip1 duplicates each Y byte into both bytes of a 16-bit lane (8.8 fixed
// point) and duplicates each U/V byte pairwise so 4 chroma samples cover
// 8 pixels.  The prfm prefetches stream the next reads into L1.
#define READYUV422 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr s1, [%[src_u]], #4 \n" \
  "ldr s2, [%[src_v]], #4 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v1.8b, v1.8b, v1.8b \n" \
  "zip1 v2.8b, v2.8b, v2.8b \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 4 U and 4 V from 210 (10-bit samples in 16-bit lanes).
// shl #6 + usra #4 widens 10-bit Y to a full 16-bit range by replicating the
// top bits into the low bits; uqshrn/uqshrn2 narrow 10-bit U/V to 8 bits with
// unsigned saturation, leaving U in v1's low half and V in the high half.
#define READYUV210 \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #6 \n" \
  "usra v0.8h, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #2 \n" \
  "uqshrn2 v1.16b, v2.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 4 U and 4 V interleaved from 210 (P210).
// Caller must preload v2 with kP210LoadShuffleIndices; tbl extracts the high
// bytes of the interleaved 16-bit U/V samples into NVTORGB's expected layout.
#define READYUVP210 \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldr q1, [%[src_uv]], #16 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n"

// Read 8 Y, 4 U and 4 V from 212 (12-bit samples in 16-bit lanes).
// Same structure as READYUV210 with shift counts adjusted for 12-bit input.
#define READYUV212 \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #4 \n" \
  "usra v0.8h, v2.8h, #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #4 \n" \
  "uqshrn2 v1.16b, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V from 410 (10-bit, full-resolution chroma).
#define READYUV410 \
  "ldr q1, [%[src_y]], #16 \n" \
  "ldr q2, [%[src_u]], #16 \n" \
  "ldr q3, [%[src_v]], #16 \n" \
  "shl v0.8h, v1.8h, #6 \n" \
  "usra v0.8h, v1.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "uqshrn v1.8b, v2.8h, #2 \n" \
  "uqshrn2 v1.16b, v3.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V interleaved from 410 (P410).
// Caller must preload v2 with kP410LoadShuffleIndices for the two-table tbl.
#define READYUVP410 \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldp q4, q5, [%[src_uv]], #32 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v4.16b, v5.16b}, v2.16b \n"

// Read 8 Y, 8 U and 8 V from 444.
#define READYUV444 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d2, [%[src_v]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_v], 448] \n"

// Read 8 Y only (monochrome).
#define READYUV400 \
  "ldr d0, [%[src_y]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n"

// tbl shuffle tables used by READNV12/READYUY2/READUYVY to split interleaved
// chroma: duplicated U bytes land in v1's low half and duplicated V bytes in
// the high half, the layout NVTORGB consumes.
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8, 8, 12, 12,
                                            2, 2, 6, 6, 10, 10, 14, 14};
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
                                 0, 0, 2, 2, 4, 4, 6, 6};
static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
                                            3, 3, 7, 7, 11, 11, 15, 15};
// Read 8 Y and 4 UV from NV12 or NV21.
// v2 must hold kNV12Table or kNV21Table; the tbl deinterleaves and duplicates
// the chroma bytes into v1 (U low half, V high half).
#define READNV12 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_uv]], #8 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n" \
  "prfm pldl1keep, [%[src_uv], 448] \n"

// Read 8 YUY2 pixels.  trn1 picks the Y bytes (even positions); v2 must hold
// kNV21InterleavedTable so tbl extracts the U/V bytes.
#define READYUY2 \
  "ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \
  "trn1 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_yuy2], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// Read 8 UYVY pixels.  trn2 picks the Y bytes (odd positions); v2 must hold
// kNV12InterleavedTable so tbl extracts the U/V bytes.
#define READUYVY \
  "ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \
  "trn2 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_uyvy], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// Broadcast-load the conversion constants:
// v28..v31 <- UB VR UG VG (from kUVCoeff)
// v24..v27 <- YG BB BG BR (from kRGBCoeffBias)
#define YUVTORGB_SETUP \
  "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
  "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"
// Output layout of the *TORGB macros (2.14 fixed point per channel):
// v16.8h: B
// v17.8h: G
// v18.8h: R

// Convert from YUV (NV12 or NV21) to 2.14 fixed point RGB.
// Similar to I4XXTORGB but U/V components are in the low/high halves of v1.
// Y is scaled by the YG coefficient (umull/umull2 + uzp2 keeps the high
// halves of the 32-bit products); uqsub applies the bias with unsigned
// saturation so channels clamp at zero rather than wrapping.
#define NVTORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v6.8h, v1.8b, v30.8b \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from YUV (I444 or I420) to 2.14 fixed point RGB.
// Similar to NVTORGB but U/V components are in v1/v2.
#define I4XXTORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v6.8h, v1.8b, v30.8b \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "umlal v6.8h, v2.8b, v31.8b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull v5.8h, v2.8b, v29.8b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from YUV I400 to 2.14 fixed point RGB.
// Caller must precompute the constant chroma terms in v4/v5/v6 (U=V=128).
#define I400TORGB \
  "umull2 v3.4s, v0.8h, v24.8h \n" \
  "umull v0.4s, v0.4h, v24.4h \n" \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from 2.14 fixed point RGB to 8 bit RGB (saturating narrow by 6).
#define RGBTORGB8 \
  "uqshrn v17.8b, v17.8h, #6 \n" \
  "uqshrn v16.8b, v16.8h, #6 \n" \
  "uqshrn v18.8b, v18.8h, #6 \n"

// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the
// top half of each lane (saturating left shift instead of narrowing).
#define RGBTORGB8_TOP \
  "uqshl v17.8h, v17.8h, #2 \n" \
  "uqshl v16.8h, v16.8h, #2 \n" \
  "uqshl v18.8h, v18.8h, #2 \n"

// Store 2.14 fixed point RGB as AR30 (2-bit alpha, 10 bits per channel).
#define STOREAR30 \
  /* Inputs: \
   * v16.8h: xxbbbbbbbbbbxxxx \
   * v17.8h: xxggggggggggxxxx \
   * v18.8h: xxrrrrrrrrrrxxxx \
   * v22.8h: 0011111111110000 (umin limit) \
   * v23.8h: 1100000000000000 (alpha) \
   */ \
  "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
  "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \
  "umin v6.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \
  "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \
  "orr v5.16b, v6.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \
  "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \
  "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \
  "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n"

// Common clobber list for the YUV->RGB kernels above.
#define YUVTORGB_REGS \
  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
  "v25", "v26", "v27", "v28", "v29", "v30", "v31"
// Convert a row of I444 (8-bit planar, full-resolution chroma) to ARGB.
// Processes 8 pixels per iteration; width is presumably a positive multiple
// of 8 — no tail handling (TODO confirm callers guarantee this).
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV444 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I444 to packed 24-bit RGB (no alpha). 8 pixels/iteration.
void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_u] "+r"(src_u),          // %[src_u]
        [src_v] "+r"(src_v),          // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
// Convert a row of I210 (10-bit planar, 2x1 subsampled chroma) to AR30
// (2.10.10.10 ARGB, alpha forced to 3). 8 pixels/iteration; width is
// presumably a positive multiple of 8 — no tail handling.
//
// Consistency fix: materialize the constant alpha with an immediate
// `movi v23.8h, #0xc0, lsl #8` as I212ToAR30Row_NEON/P210ToAR30Row_NEON do,
// instead of dup'ing a runtime variable through a GPR, and make `limit`
// const. Behavior is identical (0xc0 << 8 == 0xc000 in every lane).
void I210ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV210 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_u] "+r"(src_u),          // %[src_u]
        [src_v] "+r"(src_v),          // %[src_v]
        [dst_ar30] "+r"(dst_ar30),    // %[dst_ar30]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(uv_coeff),         // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),   // %[kRGBCoeffBias]
        [limit] "r"(limit)                // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of I410 (10-bit planar, full-resolution chroma) to AR30.
// 8 pixels/iteration; width is presumably a positive multiple of 8.
//
// Consistency fix: use the `movi v23.8h, #0xc0, lsl #8` immediate for the
// constant alpha (as I212ToAR30Row_NEON/P410ToAR30Row_NEON do) instead of
// dup'ing a runtime variable, and make `limit` const. Behavior is identical.
void I410ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV410 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_u] "+r"(src_u),          // %[src_u]
        [src_v] "+r"(src_v),          // %[src_v]
        [dst_ar30] "+r"(dst_ar30),    // %[dst_ar30]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(uv_coeff),         // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),   // %[kRGBCoeffBias]
        [limit] "r"(limit)                // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of I212 (12-bit planar, 2x1 subsampled chroma) to AR30.
// 8 pixels/iteration; width is presumably a positive multiple of 8.
void I212ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV212 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_u] "+r"(src_u),          // %[src_u]
        [src_v] "+r"(src_v),          // %[src_v]
        [dst_ar30] "+r"(dst_ar30),    // %[dst_ar30]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(uv_coeff),         // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),   // %[kRGBCoeffBias]
        [limit] "r"(limit)                // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of I210 (10-bit planar, 2x1 subsampled chroma) to 8-bit ARGB
// with opaque alpha. 8 pixels/iteration.
void I210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I410 (10-bit planar, full-resolution chroma) to 8-bit
// ARGB with opaque alpha. 8 pixels/iteration.
void I410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I212 (12-bit planar, 2x1 subsampled chroma) to 8-bit
// ARGB with opaque alpha. 8 pixels/iteration.
void I212ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV212 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),       // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I422 (8-bit planar, 2x1 subsampled chroma) to ARGB with
// opaque alpha. 8 pixels/iteration; width presumably a positive multiple of 8.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// tbl indices for READYUVP210: pick the high byte of each interleaved 16-bit
// U/V sample, duplicating U into the low half and V into the high half of v1.
// Made static const for consistency with kNV12Table et al. (read-only lookup
// table; no external linkage needed).
static const uint8_t kP210LoadShuffleIndices[] = {1, 1, 5, 5, 9, 9, 13, 13,
                                                  3, 3, 7, 7, 11, 11, 15, 15};
// Convert a row of P210 (10-bit biplanar, 2x1 subsampled interleaved UV) to
// 8-bit ARGB with opaque alpha. 8 pixels/iteration.
void P210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"  // shuffle indices for READYUVP210
      "1: \n" //
      READYUVP210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),                 // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),           // %[kRGBCoeffBias]
        [kIndices] "r"(kP210LoadShuffleIndices)   // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// tbl indices for READYUVP410: pick the high byte of each of the 16
// interleaved 16-bit U/V samples across two source registers (U bytes to the
// low half, V bytes to the high half of v1).
// Made static const for consistency with kNV12Table et al. (read-only lookup
// table; no external linkage needed).
static const uint8_t kP410LoadShuffleIndices[] = {1, 5, 9, 13, 17, 21, 25, 29,
                                                  3, 7, 11, 15, 19, 23, 27, 31};
// Convert a row of P410 (10-bit biplanar, full-resolution interleaved UV) to
// 8-bit ARGB with opaque alpha. 8 pixels/iteration.
void P410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"  // shuffle indices for READYUVP410
      "1: \n" //
      READYUVP410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),                 // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),           // %[kRGBCoeffBias]
        [kIndices] "r"(kP410LoadShuffleIndices)   // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of P210 (10-bit biplanar, 2x1 subsampled interleaved UV) to
// AR30. 8 pixels/iteration.
void P210ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile(YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "ldr q2, [%[kIndices]] \n"
      "1: \n" READYUVP210 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_ar30] "+r"(dst_ar30),  // %[dst_ar30]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),                 // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),           // %[kRGBCoeffBias]
        [limit] "r"(limit),                       // %[limit]
        [kIndices] "r"(kP210LoadShuffleIndices)   // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of P410 (10-bit biplanar, full-resolution interleaved UV) to
// AR30. 8 pixels/iteration; width presumably a positive multiple of 8.
//
// Consistency fix: `limit` is a constant and is declared const here, as in
// I212ToAR30Row_NEON/I422ToAR30Row_NEON/P210ToAR30Row_NEON.
void P410ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile(YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "ldr q2, [%[kIndices]] \n"
      "1: \n" READYUVP410 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_ar30] "+r"(dst_ar30),  // %[dst_ar30]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),                 // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),           // %[kRGBCoeffBias]
        [limit] "r"(limit),                       // %[limit]
        [kIndices] "r"(kP410LoadShuffleIndices)   // %[kIndices]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of I422 (8-bit planar, 2x1 subsampled chroma) to AR30.
// 8 pixels/iteration.
void I422ToAR30Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;  // umin clamp consumed by STOREAR30
  asm volatile (
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n" // A
      "1: \n" READYUV422 I4XXTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_ar30] "+r"(dst_ar30),  // %[dst_ar30]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(uv_coeff),       // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
        [limit] "r"(limit)              // %[limit]
      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert a row of I444 plus a separate 8-bit alpha plane to ARGB.
// 8 pixels/iteration.
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [src_a] "+r"(src_a),        // %[src_a]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I410 plus a 10-bit alpha plane to 8-bit ARGB.
// Alpha is narrowed 10->8 bits with uqshrn. 8 pixels/iteration.
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [src_a] "+r"(src_a),        // %[src_a]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I210 plus a 10-bit alpha plane to 8-bit ARGB.
// Alpha is narrowed 10->8 bits with uqshrn. 8 pixels/iteration.
void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [src_a] "+r"(src_a),        // %[src_a]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I422 plus a separate 8-bit alpha plane to ARGB.
// 8 pixels/iteration.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [src_a] "+r"(src_a),        // %[src_a]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I422 to RGBA (alpha stored first in each 4-byte group,
// hence v15 leading the st4 register list). 8 pixels/iteration.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v15.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_u] "+r"(src_u),        // %[src_u]
        [src_v] "+r"(src_v),        // %[src_v]
        [dst_rgba] "+r"(dst_rgba),  // %[dst_rgba]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v15");
}
// Convert a row of I422 to packed 24-bit RGB (no alpha). 8 pixels/iteration.
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_u] "+r"(src_u),          // %[src_u]
        [src_v] "+r"(src_v),          // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
// Pack 8-bit R/G/B into RGB565 (5-6-5) using shift-right-insert; result in
// v18.8h.
#define ARGBTORGB565 \
  /* Inputs: \
   * v16: bbbbbxxx \
   * v17: ggggggxx \
   * v18: rrrrrxxx */ \
  "shll v18.8h, v18.8b, #8 \n" /* rrrrrrxx00000000 */ \
  "shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \
  "shll v16.8h, v16.8b, #8 \n" /* bbbbbbxx00000000 */ \
  "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */

// Same packing, but for channels already in the top bits of 16-bit lanes
// (as produced by RGBTORGB8_TOP) — skips the widening shifts.
#define ARGBTORGB565_FROM_TOP \
  /* Inputs: \
   * v16: bbbbbxxxxxxxxxxx \
   * v17: ggggggxxxxxxxxxx \
   * v18: rrrrrxxxxxxxxxxx */ \
  "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */
// Convert a row of I422 to RGB565. 8 pixels/iteration.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),            // %[src_y]
        [src_u] "+r"(src_u),            // %[src_u]
        [src_v] "+r"(src_v),            // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565),  // %[dst_rgb565]
        [width] "+r"(width)             // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
// Pack ARGB into ARGB1555 (1-bit alpha, 5 bits per channel); result in v17.
#define ARGBTOARGB1555 \
  /* Inputs: \
   * v16: gggggxxxbbbbbxxx v17: axxxxxxxrrrrrxxx */ \
  "shl v1.8h, v16.8h, #8 \n" /* bbbbbxxx00000000 */ \
  "shl v2.8h, v17.8h, #8 \n" /* rrrrrxxx00000000 */ \
  "sri v17.8h, v2.8h, #1 \n" /* arrrrrxxxrrrrxxx */ \
  "sri v17.8h, v16.8h, #6 \n" /* arrrrrgggggxxxbb */ \
  "sri v17.8h, v1.8h, #11 \n" /* arrrrrgggggbbbbb */

// Same packing for channels in the top bits of 16-bit lanes (RGBTORGB8_TOP
// output), alpha preset in v19; result in v19.
#define ARGBTOARGB1555_FROM_TOP \
  /* Inputs: \
   * v16: bbbbbxxxxxxxxxxx v17: gggggxxxxxxxxxxx \
   * v18: rrrrrxxxxxxxxxxx v19: axxxxxxxxxxxxxxx */ \
  "sri v19.8h, v18.8h, #1 \n" /* arrrrrxxxxxxxxxx */ \
  "sri v19.8h, v17.8h, #6 \n" /* arrrrrgggggxxxxx */ \
  "sri v19.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
// Convert a row of I422 to ARGB1555 with alpha set (v19 preloaded with the
// 0x8000 alpha bit per lane). 8 pixels/iteration.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (YUVTORGB_SETUP
      "movi v19.8h, #0x80, lsl #8 \n"
      "1: \n" //
      READYUV422 I4XXTORGB RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" //
      ARGBTOARGB1555_FROM_TOP
      "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                // %[src_y]
        [src_u] "+r"(src_u),                // %[src_u]
        [src_v] "+r"(src_v),                // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555),  // %[dst_argb1555]
        [width] "+r"(width)                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Pack ARGB into ARGB4444 (4 bits per channel); interleaved result in v0.
#define ARGBTOARGB4444 \
  /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \
  "sri v17.8b, v16.8b, #4 \n" /* BG */ \
  "sri v19.8b, v18.8b, #4 \n" /* RA */ \
  "zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */
// Convert a row of I422 to ARGB4444 with opaque alpha. 8 pixels/iteration.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "movi v19.8b, #255 \n" ARGBTOARGB4444
      "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
                                               // pixels
                                               // ARGB4444.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                // %[src_y]
        [src_u] "+r"(src_u),                // %[src_u]
        [src_v] "+r"(src_v),                // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444),  // %[dst_argb4444]
        [width] "+r"(width)                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I400 (luma only) to ARGB with opaque alpha. The constant
// chroma contribution for U=V=128 is precomputed once into v4/v5/v6 before
// the loop (consumed by I400TORGB). 8 pixels/iteration.
void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v1.16b, #128 \n"
      "movi v19.8b, #255 \n"
      "umull v6.8h, v1.8b, v30.8b \n"
      "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */
      "umull v4.8h, v1.8b, v28.8b \n" /* DB */
      "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */
      "1: \n" READYUV400 I400TORGB
      RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),          // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
#if defined(LIBYUV_USE_ST4)
// Duplicate a row of 8-bit grey into ARGB (B=G=R=Y, A=255), ST4 variant for
// CPUs that prefer STn stores. 8 pixels/iteration.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile (
      "movi v23.8b, #255 \n"
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mov v21.8b, v20.8b \n"
      "mov v22.8b, v20.8b \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v20", "v21", "v22", "v23");
}
#else
// Duplicate a row of 8-bit grey into ARGB (B=G=R=Y, A=255), zip+stp variant
// for CPUs where ST4 is slow. 8 pixels/iteration.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile (
      "movi v20.8b, #255 \n"
      "1: \n"
      "ldr d16, [%0], #8 \n"
      "subs %w2, %w2, #8 \n"
      "zip1 v18.16b, v16.16b, v16.16b \n" // YY
      "zip1 v19.16b, v16.16b, v20.16b \n" // YA
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
      "zip2 v17.16b, v18.16b, v19.16b \n"
      "stp q16, q17, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
}
#endif // LIBYUV_USE_ST4
// Convert a row of NV12 (biplanar, interleaved UV) to ARGB with opaque
// alpha. kNV12Table is preloaded into v2 for the chroma deinterleave.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
// Convert a row of NV21 (biplanar, interleaved VU) to ARGB with opaque
// alpha. Identical to NV12ToARGBRow_NEON except kNV21Table is bound to the
// %[kNV12Table] operand to swap the chroma order.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_vu),      // %[src_uv]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+r"(width)         // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
// Convert a row of NV12 to packed 24-bit RGB. 8 pixels/iteration.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_uv] "+r"(src_uv),        // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
// Convert a row of NV21 to packed 24-bit RGB. Same kernel as
// NV12ToRGB24Row_NEON with kNV21Table bound to swap chroma order.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),          // %[src_y]
        [src_uv] "+r"(src_vu),        // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
        [width] "+r"(width)           // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
// Convert a row of NV12 to RGB565. 8 pixels/iteration.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB
      RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
                                              // pixels
                                              // RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),            // %[src_y]
        [src_uv] "+r"(src_uv),          // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),  // %[dst_rgb565]
        [width] "+r"(width)             // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}
// Convert a row of YUY2 (Y0 U Y1 V packed) to ARGB, 8 pixels per loop.
// Alpha is forced to 255 in v19; UV deinterleave uses kNV21InterleavedTable.
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP
"movi v19.8b, #255 \n" // opaque alpha
"ldr q2, [%[kNV21InterleavedTable]] \n"
"1: \n" READYUY2 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[kNV21InterleavedTable] "r"(&kNV21InterleavedTable)
: "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
// Convert a row of UYVY (U Y0 V Y1 packed) to ARGB, 8 pixels per loop.
// Alpha is forced to 255 in v19; UV deinterleave uses kNV12InterleavedTable.
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP
"movi v19.8b, #255 \n" // opaque alpha
"ldr q2, [%[kNV12InterleavedTable]] \n"
"1: \n" READUYVY NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[kNV12InterleavedTable] "r"(&kNV12InterleavedTable)
: "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// ld2 deinterleaves on load; 16 pairs (32 bytes) per iteration.
void SplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %w3, %w3, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Reads 16 byte Y's from tile and writes out 16 Y's.
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
// width measured in bytes so 8 UV = 16.
// The source advances by a full tile stride per 16-byte load, linearizing
// one row out of the tiled layout.
void DetileRow_NEON(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead
"st1 {v0.16b}, [%1], #16 \n" // store 16 bytes
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(src_tile_stride) // %3
: "cc", "memory", "v0" // Clobber List
);
}
// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's.
// 16-bit variant of DetileRow_NEON; the stride operand is doubled because
// src_tile_stride is given in elements, not bytes.
void DetileRow_16_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
"st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(src_tile_stride * 2) // %3
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
// ld2 deinterleaves U/V; width is counted in bytes (16 per iteration).
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld2 {v0.8b,v1.8b}, [%0], %4 \n"
"subs %w3, %w3, #16 \n"
"prfm pldl1keep, [%0, 1792] \n"
"st1 {v0.8b}, [%1], #8 \n"
"st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"(src_tile_stride) // %4
: "cc", "memory", "v0", "v1" // Clobber List
);
}
#if defined(LIBYUV_USE_ST2)
// Read 16 Y, 8 UV, and write 8 YUY2
// ST2 variant: interleaves Y and UV bytes with a single st2 store.
void DetileToYUY2_NEON(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
"prfm pldl1keep, [%0, 1792] \n"
"ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
"prfm pldl1keep, [%1, 1792] \n"
"subs %w3, %w3, #16 \n" // store 8 YUY2
"st2 {v0.16b,v1.16b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_yuy2), // %2
"+r"(width) // %3
: "r"(src_y_tile_stride), // %4
"r"(src_uv_tile_stride) // %5
: "cc", "memory", "v0", "v1" // Clobber list
);
}
#else
// Read 16 Y, 8 UV, and write 8 YUY2
// ZIP variant for CPUs where st2 is slow (see LIBYUV_USE_ST2 note at top of
// file): interleave with zip1/zip2 then store contiguously with st1.
void DetileToYUY2_NEON(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
"ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
"subs %w3, %w3, #16 \n"
"prfm pldl1keep, [%0, 1792] \n"
"zip1 v2.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%1, 1792] \n"
"zip2 v3.16b, v0.16b, v1.16b \n"
"st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_yuy2), // %2
"+r"(width) // %3
: "r"(src_y_tile_stride), // %4
"r"(src_uv_tile_stride) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list
);
}
#endif
// Unpack MT2T into tiled P010 64 pixels at a time. See
// tinyurl.com/mtk-10bit-video-format for format documentation.
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
asm volatile (
"1: \n"
"ld1 {v7.16b}, [%0], #16 \n"
"ld1 {v0.16b-v3.16b}, [%0], #64 \n"
"shl v4.16b, v7.16b, #6 \n"
"shl v5.16b, v7.16b, #4 \n"
"shl v6.16b, v7.16b, #2 \n"
"subs %2, %2, #80 \n"
"zip1 v16.16b, v4.16b, v0.16b \n"
"zip1 v18.16b, v5.16b, v1.16b \n"
"zip1 v20.16b, v6.16b, v2.16b \n"
"zip1 v22.16b, v7.16b, v3.16b \n"
"zip2 v17.16b, v4.16b, v0.16b \n"
"zip2 v19.16b, v5.16b, v1.16b \n"
"zip2 v21.16b, v6.16b, v2.16b \n"
"zip2 v23.16b, v7.16b, v3.16b \n"
"sri v16.8h, v16.8h, #10 \n"
"sri v17.8h, v17.8h, #10 \n"
"sri v18.8h, v18.8h, #10 \n"
"sri v19.8h, v19.8h, #10 \n"
"st1 {v16.8h-v19.8h}, [%1], #64 \n"
"sri v20.8h, v20.8h, #10 \n"
"sri v21.8h, v21.8h, #10 \n"
"sri v22.8h, v22.8h, #10 \n"
"sri v23.8h, v23.8h, #10 \n"
"st1 {v20.8h-v23.8h}, [%1], #64 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(size) // %2
:
: "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
#if defined(LIBYUV_USE_ST2)
// Reads 16 U's and V's and writes out 16 pairs of UV.
// ST2 variant: the st2 store performs the interleave.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Merge 8 U and 8 V 16-bit samples per loop into interleaved UV, scaling
// values up from `depth` bits to 16 bits via a left shift. ST2 variant.
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth; // scale depth-bit input to full 16-bit range
asm volatile (
"dup v2.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"ushl v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2");
}
#else
// Reads 16 U's and V's and writes out 16 pairs of UV.
// ZIP variant for CPUs where st2 is slow: interleave in registers, store st1.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"zip1 v2.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"zip2 v3.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile (
"dup v4.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"ushl v0.8h, v0.8h, v4.8h \n"
"ushl v1.8h, v1.8h, v4.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"zip1 v2.8h, v0.8h, v1.8h \n"
"zip2 v3.8h, v0.8h, v1.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
}
#endif // LIBYUV_USE_ST2
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
// ld3 deinterleaves the three channels on load.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
"subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
// st3 interleaves the three channels on store.
void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
// In-memory ARGB is little-endian B,G,R,A so ld4 yields v0=B,v1=G,v2=R,v3=A;
// note stores go to %3/%2/%1/%4 to route channels to the right planes.
void SplitARGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"st1 {v3.16b}, [%4], #16 \n" // store A
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#if defined(LIBYUV_USE_ST4)
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// ST4 variant: loads channels in B,G,R,A register order so st4 emits
// little-endian ARGB memory layout directly.
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v3.16b}, [%3], #16 \n" // load A
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#else
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// ZIP variant for CPUs where st4 is slow: build BGRA interleave with two
// rounds of zips (bytes then halfwords) and store contiguously with st1.
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v3.16b}, [%3], #16 \n" // load A
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%2, 448] \n"
"zip1 v4.16b, v0.16b, v1.16b \n" // BG
"zip1 v5.16b, v2.16b, v3.16b \n" // RA
"prfm pldl1keep, [%1, 448] \n"
"zip2 v6.16b, v0.16b, v1.16b \n" // BG
"zip2 v7.16b, v2.16b, v3.16b \n" // RA
"prfm pldl1keep, [%0, 448] \n"
"zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
"zip2 v1.8h, v4.8h, v5.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"zip1 v2.8h, v6.8h, v7.8h \n"
"zip2 v3.8h, v6.8h, v7.8h \n"
"st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
#endif // LIBYUV_USE_ST4
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
// Same as SplitARGBRow_NEON but the alpha channel (v3) is discarded.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
// Alpha is constant 255 (v3, set once outside the loop).
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile (
"movi v3.16b, #255 \n" // load A(255)
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Merge planar 16-bit R/G/B (depth bits significant) into packed AR30
// (2-bit alpha = 3, 10 bits per channel), 4 pixels per loop.
// Channels are widened to 32 bits, shifted to 10-bit range, clamped to 1023,
// then inserted with sli; the top byte OR sets alpha to 0b11.
void MergeXR30Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width) {
int shift = 10 - depth; // scale depth-bit input to 10 bits
asm volatile (
"movi v30.16b, #255 \n"
"ushr v30.4s, v30.4s, #22 \n" // 1023 (10-bit max)
"dup v31.4s, %w5 \n"
"1: \n"
"ldr d2, [%2], #8 \n" // B
"ldr d1, [%1], #8 \n" // G
"ldr d0, [%0], #8 \n" // R
"ushll v2.4s, v2.4h, #0 \n" // B
"ushll v1.4s, v1.4h, #0 \n" // G
"ushll v0.4s, v0.4h, #0 \n" // R
"ushl v2.4s, v2.4s, v31.4s \n" // 000B
"ushl v1.4s, v1.4s, v31.4s \n" // G
"ushl v0.4s, v0.4s, v31.4s \n" // R
"umin v2.4s, v2.4s, v30.4s \n"
"umin v1.4s, v1.4s, v30.4s \n"
"umin v0.4s, v0.4s, v30.4s \n"
"sli v2.4s, v1.4s, #10 \n" // 00GB
"sli v2.4s, v0.4s, #20 \n" // 0RGB
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
"subs %w4, %w4, #4 \n"
"str q2, [%3], #16 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "v0", "v1", "v2", "v30", "v31");
}
// Depth-10 specialization of MergeXR30Row_NEON, 8 pixels per loop.
// Inputs are clamped to 1023 and packed into AR30 halfword pairs.
void MergeXR30Row_10_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int /* depth */,
int width) {
// Neon has no "shift left and accumulate/orr", so use a multiply-add to
// perform the shift instead.
int limit = 1023; // 10-bit channel maximum
asm volatile (
"dup v5.8h, %w[limit] \n"
"movi v6.8h, #16 \n" // 1 << 4
"movi v7.8h, #4, lsl #8 \n" // 1 << 10
"1: \n"
"ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr
"ldr q1, [%1], #16 \n" // xxxxxxGggggggggg
"ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb
"umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr
"umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg
"movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000
"umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb
"mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000
"mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb
"usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg
"subs %w4, %w4, #8 \n"
"st2 {v3.8h, v4.8h}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
: [limit] "r"(limit)
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
void MergeAR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile (
"dup v30.8h, %w7 \n"
"dup v31.8h, %w6 \n"
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"ldr q3, [%3], #16 \n" // A
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"umin v0.8h, v0.8h, v30.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"umin v3.8h, v3.8h, v30.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"ushl v2.8h, v2.8h, v31.8h \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"ushl v3.8h, v3.8h, v31.8h \n"
"subs %w5, %w5, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_ar64), // %4
"+r"(width) // %5
: "r"(shift), // %6
"r"(mask) // %7
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
void MergeXR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile (
"movi v3.16b, #0xff \n" // A (0xffff)
"dup v30.8h, %w6 \n"
"dup v31.8h, %w5 \n"
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"umin v0.8h, v0.8h, v30.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"ushl v2.8h, v2.8h, v31.8h \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"subs %w4, %w4, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar64), // %3
"+r"(width) // %4
: "r"(shift), // %5
"r"(mask) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
// Merge planar 16-bit B/G/R/A (depth bits significant) into packed 8-bit
// ARGB, 8 pixels per loop. uqshl saturates into the top byte of each lane;
// trn2 then extracts those top bytes, interleaving the channel pairs.
// Note the operand bindings: %0 is src_b, %2 is src_r (intentional — the
// byte order stored is B,G,R,A).
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width) {
// Shift is 8 - depth, +8 so the result is in the top half of each lane.
int shift = 16 - depth;
asm volatile (
"dup v31.8h, %w6 \n"
"1: \n"
"ldr q0, [%0], #16 \n" // B
"ldr q1, [%1], #16 \n" // G
"ldr q2, [%2], #16 \n" // R
"ldr q3, [%3], #16 \n" // A
"uqshl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uqshl v1.8h, v1.8h, v31.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"uqshl v2.8h, v2.8h, v31.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"uqshl v3.8h, v3.8h, v31.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"trn2 v0.16b, v0.16b, v1.16b \n"
"trn2 v1.16b, v2.16b, v3.16b \n"
"subs %w5, %w5, #8 \n"
"st2 {v0.8h, v1.8h}, [%4], #32 \n"
"b.gt 1b \n"
: "+r"(src_b), // %0
"+r"(src_g), // %1
"+r"(src_r), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: "r"(shift) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
// Merge planar 16-bit B/G/R (depth bits significant) into packed 8-bit ARGB
// with constant opaque alpha (v3 = 0xff bytes), 8 pixels per loop.
// Same top-byte extraction scheme as MergeARGB16To8Row_NEON.
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width) {
// Shift is 8 - depth, +8 so the result is in the top half of each lane.
int shift = 16 - depth;
asm volatile (
"dup v31.8h, %w5 \n"
"movi v3.16b, #0xff \n" // A (0xff)
"1: \n"
"ldr q0, [%0], #16 \n" // B
"ldr q1, [%1], #16 \n" // G
"ldr q2, [%2], #16 \n" // R
"uqshl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uqshl v1.8h, v1.8h, v31.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"uqshl v2.8h, v2.8h, v31.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"trn2 v0.16b, v0.16b, v1.16b \n"
"trn2 v1.16b, v2.16b, v3.16b \n"
"subs %w4, %w4, #8 \n"
"st2 {v0.8h, v1.8h}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_b), // %0
"+r"(src_g), // %1
"+r"(src_r), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
// Copy multiple of 32.
// Straight 32-bytes-per-iteration memcpy using paired q-register load/store.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
// 16 bytes per iteration; assumes width is a multiple of 16 — TODO confirm.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
: "cc", "memory", "v0");
}
// Fill 'width' ARGB pixels with the 32-bit value v32, 4 pixels per loop.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
: "cc", "memory", "v0");
}
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Reverse a row of bytes. Walks the source backwards 32 bytes at a time,
// reversing each 16-byte half with tbl.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v3.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q2, [%0, 16] \n"
"ldr q1, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
"tbl v0.16b, {v2.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(&kShuffleMirror) // %3
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
// Reverse a row of UV pairs, keeping each pair's byte order intact.
// Width is in UV pairs (2 bytes each), 16 pairs per loop.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"(&kShuffleMirrorUV) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Reverse a row of UV pairs while splitting into separate U and V planes.
// Mirrors with the UV shuffle table, then uzp1/uzp2 deinterleave U and V.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w3, %w3, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"uzp1 v0.16b, v2.16b, v3.16b \n" // U
"uzp2 v1.16b, v2.16b, v3.16b \n" // V
"st1 {v0.16b}, [%1], #16 \n" // dst += 16
"st1 {v1.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"(&kShuffleMirrorUV) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Shuffle table for reversing the ARGB.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
// Reverse a row of ARGB pixels, keeping each pixel's 4-byte order intact.
// 8 pixels per loop.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #8 \n" // 8 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kShuffleMirrorARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Reverse a row of RGB24 pixels, 16 pixels (48 bytes) per loop.
// ld3 deinterleaves channels, each channel is byte-reversed with tbl, and
// st3 re-interleaves; src walks backwards via the -48 post-index.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
asm volatile (
"ld1 {v3.16b}, [%4] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #48 \n"
"1: \n"
"ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v0.16b, {v0.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v2.16b, {v2.16b}, v3.16b \n"
"st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-48), // %3
"r"(&kShuffleMirror) // %4
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Convert a row of RGB24 to ARGB by appending constant alpha 255 (v4),
// 8 pixels per loop.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
// RGB24.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Convert a row of RAW (R,G,B byte order) to ARGB (B,G,R,A in memory),
// 8 pixels per loop. R and B are swapped via register moves; alpha = 255.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"mov v3.8b, v1.8b \n" // move g
"prfm pldl1keep, [%0, 448] \n"
"mov v4.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
// Convert a row of RAW (R,G,B byte order) to RGBA (A,B,G,R in memory),
// 8 pixels per loop. Alpha = 255 stored first in each pixel.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile (
"movi v0.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"mov v2.8b, v4.8b \n" // move g
"prfm pldl1keep, [%0, 448] \n"
"mov v1.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
// Convert a row of RAW (R,G,B byte order) to RGB24 (B,G,R byte order) by
// swapping R and B, 8 pixels per loop.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile (
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"mov v3.8b, v1.8b \n" // move g
"prfm pldl1keep, [%0, 448] \n"
"mov v4.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
#define RGB565TOARGB \
/* Input: v0/v4.8h: RRRRRGGGGGGBBBBB */ \
"shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \
"shrn2 v1.16b, v4.8h, #3 \n" /* G GGGGGGxx */ \
"uzp2 v2.16b, v0.16b, v4.16b \n" /* R RRRRRxxx */ \
"uzp1 v0.16b, v0.16b, v4.16b \n" /* B xxxBBBBB */ \
"sri v1.16b, v1.16b, #6 \n" /* G GGGGGGGG, fill 2 */ \
"shl v0.16b, v0.16b, #3 \n" /* B BBBBB000 */ \
"sri v2.16b, v2.16b, #5 \n" /* R RRRRRRRR, fill 3 */ \
"sri v0.16b, v0.16b, #5 \n" /* R BBBBBBBB, fill 3 */
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
"movi v3.16b, #255 \n" // Alpha
"1: \n"
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB
"add %1, %1, #64 \n"
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
// Expand 8 ARGB1555 pixels (in v0.8h) to 8-bit B (v0), G (v1), R (v2),
// A (v3); the 1-bit alpha is sign-extended to 0x00 or 0xFF. Scratch: v29.
#define ARGB1555TOARGB \
/* Input: ARRRRRGGGGGBBBBB */ \
"xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
"shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
"sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \
"sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
"sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// 16-pixel variant: input halves in v0/v3; outputs B (v0), G (v1), R (v2).
// Scratch: v29.
#define RGB555TOARGB \
/* Input: xRRRRRGGGGGBBBBB */ \
"uzp1 v29.16b, v0.16b, v3.16b \n" /* xxxBBBBB */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
"shrn2 v2.16b, v3.8h, #7 \n" /* RRRRRxxx */ \
"shrn2 v1.16b, v3.8h, #2 \n" /* GGGGGxxx */ \
\
"sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ \
"sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
"sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */
// Convert a row of ARGB1555 to ARGB, 8 pixels per loop.
// The 1-bit alpha expands to 0 or 255 via the ARGB1555TOARGB macro.
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v29" // Clobber List
);
}
// Expand 8 ARGB4444 pixels (v1.8h) to interleaved 8-bit pairs: v0 holds
// R/B bytes, v1 holds A/G bytes, each nibble replicated to 8 bits.
#define ARGB4444TOARGB \
/* Input: v1.8h = AAAARRRR_GGGGBBBB */ \
"shl v0.16b, v1.16b, #4 \n" /* RRRR0000_BBBB0000 */ \
"sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \
"sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */
// 16-pixel no-alpha variant: input halves in v0/v3; outputs B (v0), G (v1),
// R (v2), each nibble replicated to 8 bits.
#define ARGB4444TORGB \
/* Input: v0.8h = xxxxRRRRGGGGBBBB */ \
"uzp1 v1.16b, v0.16b, v3.16b \n" /* GGGGBBBB */ \
"shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \
"shl v0.16b, v1.16b, #4 \n" /* BBBB0000 */ \
"shrn2 v2.16b, v3.8h, #4 \n" /* RRRRxxxx */ \
"sri v1.16b, v1.16b, #4 \n" /* GGGGGGGG */ \
"sri v2.16b, v2.16b, #4 \n" /* RRRRRRRR */ \
"sri v0.16b, v0.16b, #4 \n" /* BBBBBBBB */
// Convert a row of ARGB4444 to ARGB, 8 pixels per loop.
// ARGB4444TOARGB leaves R/B in v0 and A/G in v1; st2 interleaves them back
// into B,G,R,A byte order.
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 8 ARGB.
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
// Per-lane ushl amounts: even halfwords unshifted, odd halfwords >> 6.
static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};
// tbl indices duplicating each channel byte into a halfword; first 16 bytes
// select one channel pair, second 16 the other, per source ordering.
static const uint8_t kABGRToAR30Row_BoxIndices[] = {
2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13,
0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15};
static const uint8_t kARGBToAR30Row_BoxIndices[] = {
0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13,
2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};
// ARGB or ABGR as input, reordering based on TBL indices parameter.
// Widens each 8-bit channel to 10 bits (replicated via the tbl doubling plus
// shifts) and packs into AR30, 8 pixels per loop; bif merges the two packed
// halves under the 0xfffff mask in v2.
static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
uint8_t* dst_ar30,
int width,
const uint8_t* indices) {
asm volatile (
"movi v2.4s, #0xf, msl 16 \n" // 0xfffff
"ldr q3, [%[kAR30Row_BoxShifts]] \n"
"ldp q4, q5, [%[indices]] \n"
"1: \n"
"ldp q0, q20, [%[src]], #32 \n"
"subs %w[width], %w[width], #8 \n"
"tbl v1.16b, {v0.16b}, v5.16b \n"
"tbl v21.16b, {v20.16b}, v5.16b \n"
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v20.16b, {v20.16b}, v4.16b \n"
"ushl v0.8h, v0.8h, v3.8h \n"
"ushl v20.8h, v20.8h, v3.8h \n"
"ushl v1.8h, v1.8h, v3.8h \n"
"ushl v21.8h, v21.8h, v3.8h \n"
"ushr v0.4s, v0.4s, #6 \n"
"ushr v20.4s, v20.4s, #6 \n"
"shl v1.4s, v1.4s, #14 \n"
"shl v21.4s, v21.4s, #14 \n"
"bif v0.16b, v1.16b, v2.16b \n"
"bif v20.16b, v21.16b, v2.16b \n"
"stp q0, q20, [%[dst]], #32 \n"
"b.gt 1b \n"
: [src] "+r"(src_abcd), // %[src]
[dst] "+r"(dst_ar30), // %[dst]
[width] "+r"(width) // %[width]
: [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts), // %[kAR30Row_BoxShifts]
[indices] "r"(indices) // %[indices]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
}
// Convert ABGR pixels to AR30; thin wrapper selecting the ABGR TBL indices.
void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);
}
// Convert ARGB pixels to AR30; thin wrapper selecting the ARGB TBL indices.
void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);
}
// Drop the alpha channel: convert ARGB to RGB24, 16 pixels per loop.
// ld4 deinterleaves B/G/R/A into v0..v3; st3 re-interleaves only B/G/R.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb24,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"prfm pldl1keep, [%0, 448] \n"
"st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 16 RGB24
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Convert ARGB to RAW (R,G,B byte order), 8 pixels per loop.
// Alpha (loaded into v4) is discarded; v4/v5 are reused to reorder
// the channels so st3 can emit r,g,b.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile (
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"mov v4.8b, v2.8b \n" // mov g
"prfm pldl1keep, [%0, 448] \n"
"mov v5.8b, v1.8b \n" // mov b
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
// Extract the Y plane from YUY2 (Y0 U Y1 V byte order), 16 pixels per loop.
// ld2 deinterleaves even bytes (Y) into v0 and odd bytes (U/V) into v1.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Extract the Y plane from UYVY (U Y0 V Y1 byte order), 16 pixels per loop.
// ld2 deinterleaves: v0 gets U/V (even bytes), v1 gets Y (odd bytes).
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Split U and V from a single YUY2 row (no vertical averaging).
// ld4 gives v0=Y0, v1=U, v2=Y1, v3=V; 16 source pixels yield 8 U + 8 V.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"prfm pldl1keep, [%0, 448] \n"
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Split U and V from a single UYVY row (no vertical averaging).
// ld4 gives v0=U, v1=Y0, v2=V, v3=Y1; 16 source pixels yield 8 U + 8 V.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"prfm pldl1keep, [%0, 448] \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Split U and V from two adjacent YUY2 rows, averaging the rows vertically
// (urhadd = unsigned rounding halving add). 16 pixels -> 8 U + 8 V.
// stride_yuy2 is the byte offset to the next row.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"prfm pldl1keep, [%0, 448] \n"
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
// Split U and V from two adjacent UYVY rows, averaging the rows vertically
// (urhadd = unsigned rounding halving add). 16 pixels -> 8 U + 8 V.
// stride_uyvy is the byte offset to the next row.
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
int stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"prfm pldl1keep, [%0, 448] \n"
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
// Extract the interleaved UV bytes from two adjacent YUY2 rows with a
// vertical rounding average; UV stays interleaved in the output plane.
// 16 source pixels -> 16 UV bytes (8 U/V pairs) per loop.
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width) {
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row
"urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV
"prfm pldl1keep, [%0, 448] \n"
"st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of 4 pixels (16 bytes) per loop using a TBL lookup
// driven by the caller-supplied 16-byte shuffler mask.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width) {
asm volatile (
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %w2, %w2, #4 \n" // 4 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Interleave planar I422 (Y, U, V) into packed YUY2 (Y0 U Y1 V),
// 16 pixels per loop. ld2 splits the Ys into even (v0) and odd (v1/v2)
// positions so st4 can emit Y,U,Y,V.
void I422ToYUY2Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width) {
asm volatile (
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"subs %w4, %w4, #16 \n" // 16 pixels
"mov v2.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Interleave planar I422 (Y, U, V) into packed UYVY (U Y0 V Y1),
// 16 pixels per loop. ld2 splits the Ys into even (v1) and odd (v2/v3)
// positions so st4 can emit U,Y,V,Y.
void I422ToUYVYRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width) {
asm volatile (
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"mov v3.8b, v2.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Pack 8 ARGB pixels per loop into RGB565. The ARGBTORGB565 macro
// (defined elsewhere in this file) leaves the packed 16-bit pixels in
// v18, per the st1 below.
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb565,
int width) {
asm volatile (
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
"st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
:
: "cc", "memory", "v16", "v17", "v18", "v19");
}
// Pack 8 ARGB pixels per loop into RGB565, adding a 4-byte dither
// pattern (dither4, replicated across lanes) to B/G/R with unsigned
// saturation before the ARGBTORGB565 packing macro runs.
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
uint32_t dither4,
int width) {
asm volatile (
"dup v1.4s, %w3 \n" // dither4
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uqadd v16.8b, v16.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"uqadd v17.8b, v17.8b, v1.8b \n"
"uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
"st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb), // %1
"+r"(width) // %2
: "r"(dither4) // %3
: "cc", "memory", "v1", "v16", "v17", "v18", "v19");
}
// Pack 8 ARGB pixels per loop into ARGB1555. The ARGBTOARGB1555 macro
// (defined elsewhere in this file) leaves the packed pixels in v17,
// per the st1 below.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb1555,
int width) {
asm volatile(
"1: \n"
"ld2 {v16.8h,v17.8h}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
"st1 {v17.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v16", "v17");
}
// Pack 8 ARGB pixels per loop into ARGB4444. The ARGBTOARGB4444 macro
// (defined elsewhere in this file) leaves the packed pixels in v0,
// per the st1 below.
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile (
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19");
}
#if defined(LIBYUV_USE_ST2)
// Widen 8 ARGB pixels per loop to AR64 (16 bits per channel). Each byte
// is duplicated into both halves of its 16-bit lane (x * 257) via the
// st2 interleave of a register with its own copy. ST2 variant.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile (
"1: \n"
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"mov v1.16b, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"mov v3.16b, v2.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// TBL mask swapping the R and B bytes within each 4-byte pixel
// (ARGB <-> ABGR byte order).
static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
10, 9, 8, 11, 14, 13, 12, 15};
// Widen 8 ARGB pixels per loop to AB64 (16 bits per channel): TBL swaps
// R and B, then the st2 interleave of a register with its own copy
// duplicates each byte into a 16-bit lane (x * 257). ST2 variant.
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64,
int width) {
asm volatile (
"ldr q4, [%3] \n" // shuffler
"1: \n"
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v2.16b, {v2.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"mov v1.16b, v0.16b \n"
"mov v3.16b, v2.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ab64), // %1
"+r"(width) // %2
: "r"(&kShuffleARGBToABGR) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
#else
// Widen 8 ARGB pixels per loop to AR64 (16 bits per channel). zip1/zip2
// of a register with itself duplicates each byte into both halves of
// its 16-bit lane (x * 257). Non-ST2 variant for CPUs where STn is slow.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile (
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"zip1 v2.16b, v0.16b, v0.16b \n"
"zip2 v3.16b, v0.16b, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"zip1 v4.16b, v1.16b, v1.16b \n"
"zip2 v5.16b, v1.16b, v1.16b \n"
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
// TBL masks for ARGBToAB64Row_NEON: duplicate each byte into a 16-bit
// lane while swapping R and B. [0] expands pixels 0-1 of a 16-byte
// load, [1] expands pixels 2-3.
static const uvec8 kShuffleARGBToAB64[2] = {
{2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
{10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
// Widen 8 ARGB pixels per loop to AB64 (16 bits per channel): the TBL
// masks in kShuffleARGBToAB64 duplicate each byte into a 16-bit lane
// (x * 257) while swapping R and B. Non-ST2 variant.
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64,
int width) {
asm volatile (
"ldp q6, q7, [%3] \n" // 2 shufflers
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
"tbl v3.16b, {v0.16b}, v7.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v4.16b, {v1.16b}, v6.16b \n"
"tbl v5.16b, {v1.16b}, v7.16b \n"
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AB64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ab64), // %1
"+r"(width) // %2
: "r"(&kShuffleARGBToAB64[0]) // %3
// v5, v6 and v7 are written by the asm (tbl dest / shuffler loads) and
// must be in the clobber list, or the compiler may keep live values in
// them across this block.
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif // LIBYUV_USE_ST2
// TBL mask selecting the high (most significant) byte of each
// little-endian 16-bit channel across two 16-byte source registers.
static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31};
// Narrow 8 AR64 pixels per loop to 8-bit ARGB by taking the high byte
// of each 16-bit channel (see kShuffleAR64ToARGB above).
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
uint8_t* dst_argb,
int width) {
asm volatile (
"ldr q4, [%3] \n" // shuffler
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_ar64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kShuffleAR64ToARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// TBL mask selecting the high byte of each little-endian 16-bit channel
// while also swapping the R and B channels.
static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15,
21, 19, 17, 23, 29, 27, 25, 31};
// Narrow 8 AB64 pixels per loop to 8-bit ARGB: take the high byte of
// each 16-bit channel and swap R/B (see kShuffleAB64ToARGB above).
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
uint8_t* dst_argb,
int width) {
asm volatile (
"ldr q4, [%3] \n" // shuffler
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_ab64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kShuffleAB64ToARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Copy the alpha channel of 16 ARGB pixels per loop into a contiguous
// plane. ld4 deinterleaves B/G/R/A; only the A register (v3) is stored.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// RGB -> U/V coefficient tables passed to ARGBToUV444MatrixRow_NEON
// (below). One 4-byte row per output channel; unsigned form.
struct RgbUVConstantsU8 {
uint8_t kRGBToU[4];
uint8_t kRGBToV[4];
};
// Signed variant of the RGB -> U/V coefficient tables.
struct RgbUVConstantsI8 {
int8_t kRGBToU[4];
int8_t kRGBToV[4];
};
// 8x1 pixels.
static void ARGBToUV444MatrixRow_NEON(
const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstantsU8* rgbuvconsta