; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;-----------------------------------------------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Six-tap sub-pixel prediction of a block 8 pixels wide and 4 pixels high,
; written for the ARMv6 media instructions (ARM state, armasm syntax).
;
; Arguments:
;   r0      unsigned char *src_ptr
;   r1      int            src_pixels_per_line
;   r2      int            xoffset   selects a 16-byte coefficient row; 0 = no
;                                    horizontal filtering
;   r3      int            yoffset   selects a 16-byte coefficient row; 0 = no
;                                    vertical filtering
;   stack   unsigned char *dst_ptr
;   stack   int            dst_pitch
;
; Registers r4-r11 and lr are saved on entry and restored on exit; r0-r3 and
; r12 are used as scratch.
;
; Stack frame (184 bytes below the saved registers, sp stays fixed):
;   [sp]          saved yoffset
;   [sp + 4 ...]  intermediate buffer, stored transposed: 8 rows of 20 bytes,
;                 one row per destination column.  Each row holds 9 halfword
;                 samples (source lines -2 .. +6 of that column) followed by
;                 2 bytes of padding that keep every row word aligned.
;
; Pass 1 walks 9 source lines and writes one halfword into every buffer row
; per line.  Pass 2 reads each buffer row as packed halfword pairs and writes
; 4 destination pixels down one column, so the data is transposed back.
;
; The buffer is addressed through lr rather than by moving sp.  That keeps
; sp word aligned and keeps all live scratch data at or above sp, so nothing
; that pushes onto this stack asynchronously can overwrite it.
;-----------------------------------------------------------------------------
    EXPORT  |vp8_sixtap_predict8x4_armv6|

    AREA    |.text|, CODE, READONLY         ; name this block of code

|vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}
    sub         sp, sp, #184                ; scratch frame: 4 (yoffset) + 20 x (8+1)

    cmp         r2, #0                      ; xoffset == 0 -> skip the first pass filter
    str         r3, [sp]                    ; keep yoffset at the frame base
    add         lr, sp, #4                  ; lr = buffer write cursor (flags untouched)
    beq         skip_firstpass_filter

;-----------------------------------------------------------------------------
; First pass: horizontal six-tap filter over 9 source lines
;-----------------------------------------------------------------------------
    ldr         r12, _filter8_coeff_        ; word stored at _filter8_coeff_ = table address
    sub         r0, r0, r1, lsl #1          ; start two lines above the block

    add         r2, r12, r2, lsl #4         ; 16 bytes of coefficients per xoffset
    add         r0, r0, #3                  ; bias src so the loads below use small offsets

    ldr         r3, [r2]                    ; taps 0,1 packed as two halfwords
    ldr         r4, [r2, #4]                ; taps 2,3
    ldr         r5, [r2, #8]                ; taps 4,5

    mov         r2, #0x90000                ; line counter (9) lives in the upper halfword

    sub         r1, r1, #8                  ; stride minus the 8 bytes consumed per line

first_pass_hloop_v6
    ldrb        r6, [r0, #-5]               ; p[-2]
    ldrb        r7, [r0, #-4]               ; p[-1]
    ldrb        r8, [r0, #-3]               ; p[0]
    ldrb        r9, [r0, #-2]               ; p[1]
    ldrb        r10, [r0, #-1]              ; p[2]

    orr         r2, r2, #0x4                ; low byte = 4 iterations of 2 pixels (width 8)

    pkhbt       r6, r6, r7, lsl #16         ; p[-2] | p[-1] << 16
    pkhbt       r7, r7, r8, lsl #16         ; p[-1] | p[0]  << 16
    pkhbt       r8, r8, r9, lsl #16         ; p[0]  | p[1]  << 16
    pkhbt       r9, r9, r10, lsl #16        ; p[1]  | p[2]  << 16

first_pass_wloop_v6
    smuad       r11, r6, r3                 ; even pixel: taps 0,1
    smuad       r12, r7, r3                 ; odd pixel:  taps 0,1

    ldrb        r6, [r0], #1                ; p[3]

    smlad       r11, r8, r4, r11            ; even pixel: taps 2,3
    ldrb        r7, [r0], #1                ; p[4]
    smlad       r12, r9, r4, r12            ; odd pixel:  taps 2,3

    pkhbt       r10, r10, r6, lsl #16       ; p[2] | p[3] << 16
    pkhbt       r6, r6, r7, lsl #16         ; p[3] | p[4] << 16
    smlad       r11, r10, r5, r11           ; even pixel: taps 4,5
    smlad       r12, r6, r5, r12            ; odd pixel:  taps 4,5

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round ...
    tst         r2, #0xff                   ; Z set when the line is finished
    usat        r11, #8, r11, asr #7        ; ... shift by 7 and clamp to 0..255
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; transposed store: next pixel -> next buffer row
    usat        r12, #8, r12, asr #7

    strh        r12, [lr], #20

    ; Slide the window two pixels to the right for the next iteration.
    movne       r11, r6                     ; p[3] | p[4] << 16
    movne       r12, r7                     ; p[4]

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one source line done

    sub         lr, lr, #158                ; back 8 rows (160), forward one halfword (2)

    add         r0, r0, r1                  ; move to next input line

    bne         first_pass_hloop_v6

;-----------------------------------------------------------------------------
; Second pass: vertical six-tap filter, reading the transposed buffer
;-----------------------------------------------------------------------------
secondpass_filter
    ldr         r3, [sp]                    ; load back yoffset
    add         lr, sp, #4                  ; lr = buffer read cursor, row 0
    ldr         r0, [sp, #220]              ; dst_ptr:   184 (frame) + 36 (9 saved registers)
    ldr         r1, [sp, #224]              ; dst_pitch: the following stack argument

    cmp         r3, #0
    beq         skip_secondpass_filter

    ldr         r12, _filter8_coeff_
    add         r12, r12, r3, lsl #4        ; 16 bytes of coefficients per yoffset

    mov         r2, #0x00080000             ; column counter (8) lives in the upper halfword

    ldr         r3, [r12]                   ; taps 0,1
    ldr         r4, [r12, #4]               ; taps 2,3
    ldr         r5, [r12, #8]               ; taps 4,5

    ; Odd output rows start one sample later, so they need the taps paired
    ; across the word boundaries of the data.
    pkhbt       r12, r4, r3                 ; tap 2 | tap 1 << 16
    pkhbt       r11, r5, r4                 ; tap 4 | tap 3 << 16

second_pass_hloop_v6
    ldr         r6, [lr]                    ; s0 | s1 << 16
    ldr         r7, [lr, #4]                ; s2 | s3 << 16

    orr         r2, r2, #2                  ; low byte = 2 iterations of 2 rows (height 4)

second_pass_wloop_v6
    smulbt      r10, r3, r6                 ; odd row:  tap0 * s1 (must read r6 first)
    smuad       r6, r3, r6                  ; even row: tap0*s0 + tap1*s1, r6 becomes the accumulator

    ldr         r8, [lr, #8]                ; s4 | s5 << 16

    smlad       r6, r4, r7, r6              ; even row: taps 2,3
    smladx      r10, r12, r7, r10           ; odd row:  tap1*s2 + tap2*s3

    ldrh        r9, [lr, #12]               ; s6

    smlad       r6, r5, r8, r6              ; even row: taps 4,5
    smladx      r10, r11, r8, r10           ; odd row:  tap3*s4 + tap4*s5

    add         lr, lr, #4                  ; advance two samples within the row
    smlatb      r10, r5, r9, r10            ; odd row:  tap5 * s6

    sub         r2, r2, #1

    add         r6, r6, #0x40               ; round ...
    tst         r2, #0xff                   ; Z set when the column is finished
    usat        r6, #8, r6, asr #7          ; ... shift by 7 and clamp to 0..255
    add         r10, r10, #0x40
    strb        r6, [r0], r1                ; the result is transposed back and stored
    usat        r10, #8, r10, asr #7

    strb        r10, [r0], r1

    movne       r6, r7                      ; slide the sample window down two rows
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one destination column done
    add         lr, lr, #12                 ; next buffer row (20 - 8 already advanced)
    sub         r0, r0, r1, lsl #2          ; back up 4 destination rows ...
    add         r0, r0, #1                  ; ... and step to the next column

    bne         second_pass_hloop_v6

    add         sp, sp, #184                ; release the scratch frame
    ldmia       sp!, {r4 - r11, pc}

;-----------------------------------------------------------------------------
; xoffset == 0: copy 9 source lines of 8 pixels into the transposed buffer
;-----------------------------------------------------------------------------
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start two lines above the block
    sub         r1, r1, #8                  ; stride minus the 8 bytes consumed per line
    mov         r2, #9                      ; 9 source lines

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags stay live until the bne below
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to the intermediate buffer
    ldrb        r6, [r0], #1
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; back 8 rows (160), forward one halfword (2)

    bne         skip_firstpass_hloop

    b           secondpass_filter

;-----------------------------------------------------------------------------
; yoffset == 0: copy samples 2..5 of every buffer row to the destination
;-----------------------------------------------------------------------------
skip_secondpass_filter
    mov         r2, #8                      ; 8 destination columns
    add         lr, lr, #4                  ; start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [lr], #4                ; s2 | s3 << 16
    subs        r2, r2, #1                  ; flags stay live until the bne below
    ldr         r8, [lr], #4                ; s4 | s5 << 16

    mov         r7, r6, lsr #16             ; unpack the upper halfwords
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16

    strb        r7, [r0], r1
    add         lr, lr, #12                 ; next buffer row (20 - 8 already advanced)
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up 4 destination rows ...
    add         r0, r0, #1                  ; ... and step to the next column

    bne         skip_secondpass_hloop

    add         sp, sp, #184                ; release the scratch frame
    ldmia       sp!, {r4 - r11, pc}

    ENDP
;-----------------------------------------------------------------------------
; Six-tap filter coefficients.
;
; _filter8_coeff_ is one word holding the address of filter8_coeff; the code
; fetches that word to locate the table.
;
; filter8_coeff has 8 rows of four words (16 bytes per row), one row per
; sub-pixel offset.  Within a row the six taps are packed as signed
; halfwords, two per word with the lower-numbered tap in the low halfword:
;   word 0 = tap0 | tap1 << 16
;   word 1 = tap2 | tap3 << 16
;   word 2 = tap4 | tap5 << 16
;   word 3 = 0 (pads the row to 16 bytes)
;-----------------------------------------------------------------------------
    AREA    subpelfilters8_dat, DATA, READWRITE            ;read/write by default

_filter8_coeff_
    DCD     filter8_coeff
filter8_coeff
    DCD     0x00000000,     0x00000080, 0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b, 0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c, 0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d, 0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d, 0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032, 0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024, 0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c, 0x0000fffa,     0x00000000

;The same taps, unpacked (tap0 .. tap5), one line per table row:
    ;DCD     0,  0,  128,    0,   0,  0
    ;DCD     0, -6,  123,   12,  -1,  0
    ;DCD     2, -11, 108,   36,  -8,  1
    ;DCD     0, -9,   93,   50,  -6,  0
    ;DCD     3, -16,  77,   77, -16,  3
    ;DCD     0, -6,   50,   93,  -9,  0
    ;DCD     1, -8,   36,  108, -11,  2
    ;DCD     0, -1,   12,  123,  -6,  0

    END