@ blob: 85dd2fb9df2139cba3e0ad84981e095767700c72 [file] [log] [blame]
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ This file contains some minimum and maximum functions, optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file min_max_operations.c. Code here is basically
@ a loop unrolling by 8 with Neon instructions. Bit-exact.
@ Assemble for ARMv7-A with the NEON (Advanced SIMD) instruction set enabled;
@ every function below uses NEON q/d registers.
.arch armv7-a
.fpu neon
@ Entry points exported to the linker. Their C prototypes are repeated in the
@ comment directly above each function body.
.global WebRtcSpl_MaxAbsValueW16Neon
.global WebRtcSpl_MaxAbsValueW32Neon
.global WebRtcSpl_MaxValueW16Neon
.global WebRtcSpl_MaxValueW32Neon
.global WebRtcSpl_MinValueW16Neon
.global WebRtcSpl_MinValueW32Neon
@ GNU as on ARM reads the .align operand as a power of two, so this requests
@ 4-byte alignment for the code that follows.
.align 2
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
@
@ Largest absolute value among the first |length| 16-bit elements of |vector|,
@ clamped to 32767. Yields -1 when |vector| is NULL or |length| <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result
@   Scratch: r1-r3, q12, q13, flags
WebRtcSpl_MaxAbsValueW16Neon:
  .fnstart
  mvn       r2, #0                  @ r2 = -1: error result, and the seed for
                                    @ the scalar-only path.
  cmp       r0, #0
  beq       .Lmax_abs_w16_done      @ NULL pointer.
  cmp       r1, #0
  ble       .Lmax_abs_w16_done      @ Nothing to scan.
  cmp       r1, #8
  blt       .Lmax_abs_w16_tail      @ Fewer than 8 elements: scalar loop only.

  vmov.i16  q12, #0                 @ q12 = eight per-lane running maxima.
  sub       r1, r1, #8              @ Bias the counter so that 'bge' keeps
                                    @ looping while a full group of 8 remains.
.Lmax_abs_w16_by8:
  vld1.16   {q13}, [r0]!
  vabs.s16  q13, q13                @ -32768 stays 0x8000 after vabs.
  subs      r1, r1, #8              @ NEON ops leave the flags alone.
  vmax.u16  q12, q12, q13           @ Unsigned max, so 0x8000 ranks highest.
  bge       .Lmax_abs_w16_by8

  @ Collapse the eight lanes into one value and hand it to r2.
  vmax.u16  d24, d24, d25
  vpmax.u16 d24, d24, d24
  vpmax.u16 d24, d24, d24
  vmov.u16  r2, d24[0]              @ Zero-extended: 0..32768.
  adds      r1, r1, #8              @ Remove the bias: 0..7 elements remain.
  beq       .Lmax_abs_w16_done

.Lmax_abs_w16_tail:
  ldrsh     r3, [r0], #2
  cmp       r3, #0
  rsblt     r3, r3, #0              @ r3 = |r3| (32768 for -32768).
  cmp       r2, r3
  movlt     r2, r3
  subs      r1, r1, #1
  bne       .Lmax_abs_w16_tail

.Lmax_abs_w16_done:
  cmp       r2, #0x8000             @ |-32768| does not fit in int16_t ...
  subeq     r2, r2, #1              @ ... so report 32767 instead.
  mov       r0, r2
  bx        lr
  .fnend
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
@
@ Returns the largest absolute value in vector[0..length-1], saturated to
@ 0x7FFFFFFF, or -1 when vector is NULL or length <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result
@   Scratch: r1-r3, r12, q11-q14, flags
WebRtcSpl_MaxAbsValueW32Neon:
  .fnstart
  cmp r0, #0
  moveq r0, #-1
  beq EXIT                    @ Return -1 for a NULL pointer.
  cmp r1, #0                  @ length
  movle r0, #-1
  ble EXIT                    @ Return -1 if length <= 0.

  mov r2, #0                  @ Seed the running maximum. Without this the
                              @ length < 8 path would compare against
                              @ whatever the caller left in r2.
  vmov.i32 q11, #0
  vmov.i32 q12, #0
  cmp r1, #8
  blt LOOP_MAX_ABS_VALUE_W32  @ Fewer than 8 elements: scalar loop only.

  sub r1, #8                  @ Bias the counter so 'bge' loops while a full
                              @ group of 8 remains.

LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
  vld1.32 {q13, q14}, [r0]!
  subs r1, #8                 @ NEON instructions below keep these flags.
  vabs.s32 q13, q13           @ vabs doesn't change the value of 0x80000000.
  vabs.s32 q14, q14
  vmax.u32 q11, q13           @ Use u32 so we don't lose the value 0x80000000.
  vmax.u32 q12, q14
  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.u32 q12, q11
  vmax.u32 d24, d25
  vpmax.u32 d24, d24
  adds r1, #8                 @ Undo the bias: 0..7 elements remain.
  vmov.u32 r2, d24[0]
  beq END_MAX_ABS_VALUE_W32

LOOP_MAX_ABS_VALUE_W32:
  ldr r3, [r0], #4
  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
  sub r12, r12, r3, asr #31
  cmp r2, r12
  movcc r2, r12               @ Unsigned compare: 0x80000000 ranks highest.
  subs r1, #1
  bne LOOP_MAX_ABS_VALUE_W32

END_MAX_ABS_VALUE_W32:
  mvn r0, #0x80000000         @ Guard against the case for 0x80000000.
  cmp r2, r0
  movcc r0, r2                @ r0 = min(r2, 0x7FFFFFFF), unsigned.

EXIT:
  bx  lr
  .fnend
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
@
@ Returns the largest (signed) value in vector[0..length-1], or -32768 when
@ vector is NULL or length <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result, sign-extended to 32 bits
@   Scratch: r1-r3, q12, q13, flags
WebRtcSpl_MaxValueW16Neon:
  .fnstart
  mov r2, #0x8000             @ Initialize the return value ...
  sxth r2, r2                 @ ... as a sign-extended -32768. A bare 0x8000
                              @ is +32768 and would beat every element in
                              @ the signed compare of the scalar loop.
  cmp r0, #0
  beq END_MAX_VALUE_W16
  cmp r1, #0
  ble END_MAX_VALUE_W16

  vmov.i16 q12, #0x8000       @ Every lane starts at -32768.
  cmp r1, #8
  blt LOOP_MAX_VALUE_W16      @ Fewer than 8 elements: scalar loop only.

  sub r1, #8                  @ Bias the counter so 'bge' loops while a full
                              @ group of 8 remains.

LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
  vld1.16 {q13}, [r0]!
  subs r1, #8                 @ NEON instructions below keep these flags.
  vmax.s16 q12, q13
  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16

  @ Find the maximum value in the Neon registers and move it to r2.
  vmax.s16 d24, d25
  vpmax.s16 d24, d24
  vpmax.s16 d24, d24
  adds r1, #8                 @ Undo the bias: 0..7 elements remain.
  vmov.s16 r2, d24[0]         @ Sign-extend: a negative maximum must stay
                              @ negative for the signed compare below.
  beq END_MAX_VALUE_W16

LOOP_MAX_VALUE_W16:
  ldrsh r3, [r0], #2
  cmp r2, r3
  movlt r2, r3
  subs r1, #1
  bne LOOP_MAX_VALUE_W16

END_MAX_VALUE_W16:
  mov r0, r2
  bx  lr
  .fnend
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
@
@ Largest signed value among the first |length| 32-bit elements of |vector|.
@ Yields 0x80000000 (INT32_MIN) when |vector| is NULL or |length| <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result
@   Scratch: r1-r3, q11-q14, flags
WebRtcSpl_MaxValueW32Neon:
  .fnstart
  mov       r2, #0x80000000         @ Running maximum starts at INT32_MIN.
  cmp       r0, #0
  beq       .Lmax_w32_done          @ NULL pointer.
  cmp       r1, #0
  ble       .Lmax_w32_done          @ Nothing to scan.
  cmp       r1, #8
  blt       .Lmax_w32_tail          @ Fewer than 8 elements: scalar loop only.

  vmov.i32  q11, #0x80000000        @ Two accumulators of four lanes each,
  vmov.i32  q12, #0x80000000        @ every lane seeded with INT32_MIN.
  sub       r1, r1, #8              @ Bias the counter so that 'bge' keeps
                                    @ looping while a full group of 8 remains.
.Lmax_w32_by8:
  vld1.32   {q13, q14}, [r0]!
  vmax.s32  q11, q11, q13
  vmax.s32  q12, q12, q14
  subs      r1, r1, #8
  bge       .Lmax_w32_by8

  @ Collapse both accumulators into a single value and hand it to r2.
  vmax.s32  q12, q12, q11
  vpmax.s32 d24, d24, d25
  vpmax.s32 d24, d24, d24
  vmov.32   r2, d24[0]
  adds      r1, r1, #8              @ Remove the bias: 0..7 elements remain.
  beq       .Lmax_w32_done

.Lmax_w32_tail:
  ldr       r3, [r0], #4
  cmp       r3, r2
  movgt     r2, r3                  @ Keep the larger of the two (signed).
  subs      r1, r1, #1
  bne       .Lmax_w32_tail

.Lmax_w32_done:
  mov       r0, r2
  bx        lr
  .fnend
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
@
@ Smallest signed value among the first |length| 16-bit elements of |vector|.
@ Yields 32767 when |vector| is NULL or |length| <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result, sign-extended to 32 bits
@   Scratch: r1-r3, q12, q13, flags
WebRtcSpl_MinValueW16Neon:
  .fnstart
  movw      r2, #0x7FFF             @ Running minimum starts at INT16_MAX.
  cmp       r0, #0
  beq       .Lmin_w16_done          @ NULL pointer.
  cmp       r1, #0
  ble       .Lmin_w16_done          @ Nothing to scan.
  cmp       r1, #8
  blt       .Lmin_w16_tail          @ Fewer than 8 elements: scalar loop only.

  vdup.16   q12, r2                 @ Every lane starts at 0x7FFF.
  sub       r1, r1, #8              @ Bias the counter so that 'bge' keeps
                                    @ looping while a full group of 8 remains.
.Lmin_w16_by8:
  vld1.16   {q13}, [r0]!
  vmin.s16  q12, q12, q13
  subs      r1, r1, #8
  bge       .Lmin_w16_by8

  @ Collapse the eight lanes into the overall minimum and hand it to r2.
  vmin.s16  d24, d24, d25
  vpmin.s16 d24, d24, d24
  vpmin.s16 d24, d24, d24
  vmov.s16  r2, d24[0]              @ Signed move already sign-extends to 32
                                    @ bits, so no separate sxth is needed.
  adds      r1, r1, #8              @ Remove the bias: 0..7 elements remain.
  beq       .Lmin_w16_done

.Lmin_w16_tail:
  ldrsh     r3, [r0], #2
  cmp       r3, r2
  movle     r2, r3                  @ Keep the smaller of the two (signed).
  subs      r1, r1, #1
  bne       .Lmin_w16_tail

.Lmin_w16_done:
  mov       r0, r2
  bx        lr
  .fnend
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
@
@ Smallest signed value among the first |length| 32-bit elements of |vector|.
@ Yields 0x7FFFFFFF (INT32_MAX) when |vector| is NULL or |length| <= 0.
@   In:      r0 = vector, r1 = length
@   Out:     r0 = result
@   Scratch: r1-r3, q11-q14, flags
WebRtcSpl_MinValueW32Neon:
  .fnstart
  mvn       r2, #0x80000000         @ Running minimum starts at 0x7FFFFFFF.
  cmp       r0, #0
  beq       .Lmin_w32_done          @ NULL pointer.
  cmp       r1, #0
  ble       .Lmin_w32_done          @ Nothing to scan.
  cmp       r1, #8
  blt       .Lmin_w32_tail          @ Fewer than 8 elements: scalar loop only.

  vdup.32   q11, r2                 @ Two accumulators of four lanes each,
  vdup.32   q12, r2                 @ every lane seeded with 0x7FFFFFFF.
  sub       r1, r1, #8              @ Bias the counter so that 'bge' keeps
                                    @ looping while a full group of 8 remains.
.Lmin_w32_by8:
  vld1.32   {q13, q14}, [r0]!
  vmin.s32  q11, q11, q13
  vmin.s32  q12, q12, q14
  subs      r1, r1, #8
  bge       .Lmin_w32_by8

  @ Collapse both accumulators into the overall minimum and hand it to r2.
  vmin.s32  q12, q12, q11
  vpmin.s32 d24, d24, d25
  vpmin.s32 d24, d24, d24
  vmov.32   r2, d24[0]
  adds      r1, r1, #8              @ Remove the bias: 0..7 elements remain.
  beq       .Lmin_w32_done

.Lmin_w32_tail:
  ldr       r3, [r0], #4
  cmp       r3, r2
  movle     r2, r3                  @ Keep the smaller of the two (signed).
  subs      r1, r1, #1
  bne       .Lmin_w32_tail

.Lmin_w32_done:
  mov       r0, r2
  bx        lr
  .fnend