| ;*! |
| ;* \copy |
| ;* Copyright (c) 2010-2013, Cisco Systems |
| ;* All rights reserved. |
| ;* |
| ;* Redistribution and use in source and binary forms, with or without |
| ;* modification, are permitted provided that the following conditions |
| ;* are met: |
| ;* |
| ;* * Redistributions of source code must retain the above copyright |
| ;* notice, this list of conditions and the following disclaimer. |
| ;* |
| ;* * Redistributions in binary form must reproduce the above copyright |
| ;* notice, this list of conditions and the following disclaimer in |
| ;* the documentation and/or other materials provided with the |
| ;* distribution. |
| ;* |
| ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
| ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| ;* POSSIBILITY OF SUCH DAMAGE. |
| ;* |
| ;* |
| ;* vaa.asm |
| ;* |
| ;* Abstract |
;*  SSE2 implementations of the pVaa routines
| ;* |
| ;* History |
| ;* 04/14/2010 Created |
| ;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3) |
| ;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement |
| ;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 |
| ;* |
| ;*************************************************************************/ |
| %include "asm_inc.asm" |
| |
| |
| ;*********************************************************************** |
| ; Macros and other preprocessor constants |
| ;*********************************************************************** |
| %macro SUM_SQR_SSE2 3 ; dst, pSrc, zero |
| movdqa %1, %2 |
| punpcklbw %1, %3 |
| punpckhbw %2, %3 |
| pmaddwd %1, %1 |
| pmaddwd %2, %2 |
| paddd %1, %2 |
| pshufd %2, %1, 04Eh ; 01001110 B |
| paddd %1, %2 |
| pshufd %2, %1, 0B1h ; 10110001 B |
| paddd %1, %2 |
| %endmacro ; END OF SUM_SQR_SSE2 |
| |
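; WELS_SAD_16x2_SSE2 accumulates the SAD of two 16-byte rows into xmm6 (psadbw
; keeps one partial sum per 64-bit lane, i.e. the left and right 8-pixel
; halves) and advances both pointers by two strides. Scalar sketch:
;   for (i = 0; i < 2; i++)
;     for (x = 0; x < 16; x++)
;       sad += abs(cur[i * stride + x] - ref[i * stride + x]);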
%macro WELS_SAD_16x2_SSE2 3 ; esi:%1 edi:%2 ebx:%3
| movdqa xmm1, [%1] |
| movdqa xmm2, [%2] |
| movdqa xmm3, [%1+%3] |
| movdqa xmm4, [%2+%3] |
| psadbw xmm1, xmm2 |
| psadbw xmm3, xmm4 |
| paddd xmm6, xmm1 |
| paddd xmm6, xmm3 |
| lea %1, [%1+%3*2] |
| lea %2, [%2+%3*2] |
| %endmacro |
| |
; benchmarking shows this outperforms the phaddw (SSSE3) alternative
| %macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp |
| ; @sum_8x2 begin |
| pshufd %2, %1, 04Eh ; 01001110 B |
| paddw %1, %2 |
| pshuflw %2, %1, 04Eh ; 01001110 B |
| paddw %1, %2 |
| pshuflw %2, %1, 0B1h ; 10110001 B |
| paddw %1, %2 |
| ; end of @sum_8x2 |
| %endmacro ; END of SUM_WORD_8x2_SSE2 |
| |
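; WELS_SAD_SUM_SQSUM_16x1_SSE2 processes one 16-byte row: xmm6 += sad(cur, ref),
; xmm5 += sum(cur) and xmm4 += sum(cur[x] * cur[x]); xmm0 must hold zero and
; both pointers advance by one stride. xmm1-xmm3 are clobbered.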
| %macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3 |
| movdqa xmm1, [%1] |
| movdqa xmm2, [%2] |
| movdqa xmm3, xmm1 |
| psadbw xmm3, xmm2 |
| paddd xmm6, xmm3 |
| |
| movdqa xmm3, xmm1 |
| psadbw xmm3, xmm0 |
| paddd xmm5, xmm3 |
| |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| pmaddwd xmm1, xmm1 |
| pmaddwd xmm2, xmm2 |
| paddd xmm4, xmm1 |
| paddd xmm4, xmm2 |
| |
| add %1, %3 |
| add %2, %3 |
| %endmacro |
| |
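; WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 extends the above with the squared
; difference: xmm7 += sad, xmm6 += sum(cur), xmm5 += sqsum(cur) and
; xmm4 += sqdiff. The per-byte absolute difference uses the branch-free trick
;   d[x] = max(cur[x], ref[x]) - min(cur[x], ref[x]);  /* == abs(cur - ref) */
; since SSE2 has no unsigned byte absolute-difference instruction.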
| %macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3 |
| movdqa xmm1, [%1] |
| movdqa xmm2, [%2] |
| movdqa xmm3, xmm1 |
| psadbw xmm3, xmm2 |
| paddd xmm7, xmm3 ; sad |
| |
| movdqa xmm3, xmm1 |
| pmaxub xmm3, xmm2 |
| pminub xmm2, xmm1 |
| psubb xmm3, xmm2 ; diff |
| |
| movdqa xmm2, xmm1 |
| psadbw xmm2, xmm0 |
| paddd xmm6, xmm2 ; sum |
| |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| pmaddwd xmm1, xmm1 |
| pmaddwd xmm2, xmm2 |
| paddd xmm5, xmm1 |
| paddd xmm5, xmm2 ; sqsum |
| |
| movdqa xmm1, xmm3 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm3, xmm0 |
| pmaddwd xmm1, xmm1 |
| pmaddwd xmm3, xmm3 |
| paddd xmm4, xmm1 |
| paddd xmm4, xmm3 ; sqdiff |
| |
| add %1, %3 |
| add %2, %3 |
| %endmacro |
| |
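; WELS_SAD_SD_MAD_16x1_SSE2 accumulates, for one 16-byte row, the SAD (%1),
; the byte sums of cur (%2) and ref (%3), and the per-byte running maximum of
; abs(cur - ref) (%4, the MAD input); xmm0 must hold zero, xmm1-xmm3 are
; clobbered.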
| %macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 |
| %define sad_reg %1 |
| %define sum_cur_reg %2 |
| %define sum_ref_reg %3 |
| %define mad_reg %4 |
| movdqa xmm1, [%5] |
| movdqa xmm2, [%6] |
| movdqa xmm3, xmm1 |
| psadbw xmm3, xmm0 |
| paddd sum_cur_reg, xmm3 ; sum_cur |
| movdqa xmm3, xmm2 |
| psadbw xmm3, xmm0 |
| paddd sum_ref_reg, xmm3 ; sum_ref |
| |
| movdqa xmm3, xmm1 |
| pmaxub xmm3, xmm2 |
| pminub xmm2, xmm1 |
| psubb xmm3, xmm2 ; abs diff |
| pmaxub mad_reg, xmm3 ; max abs diff |
| |
| psadbw xmm3, xmm0 |
| paddd sad_reg, xmm3 ; sad |
| |
| add %5, %7 |
| add %6, %7 |
| %endmacro |
| |
| |
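; WELS_MAX_REG_SSE2 reduces %1 with a shift-and-max ladder so that byte 0 ends
; up holding the maximum of bytes 0-7 and byte 8 the maximum of bytes 8-15;
; callers read the two results via movd and movhlps+movd. xmm1 is clobbered.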
| %macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used |
| %define max_reg %1 |
| movdqa xmm1, max_reg |
| psrldq xmm1, 4 |
| pmaxub max_reg, xmm1 |
| movdqa xmm1, max_reg |
| psrldq xmm1, 2 |
| pmaxub max_reg, xmm1 |
| movdqa xmm1, max_reg |
| psrldq xmm1, 1 |
| pmaxub max_reg, xmm1 |
| %endmacro |
| |
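; WELS_SAD_BGD_SQDIFF_16x1_SSE2 packs two statistics per 64-bit lane of %1:
; the psllq/psrlq sequence folds the row's sqsum(cur) into the high dword
; while psadbw adds the SAD into the low dword. %2 likewise interleaves
; sum(cur) (low dword) and sum(ref) (shifted to the high dword), %3 tracks the
; per-byte max abs diff, and %4 accumulates the squared differences; xmm0 must
; hold zero.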
| %macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 |
| %define sad_reg %1 |
| %define sum_reg %2 |
| %define mad_reg %3 |
| %define sqdiff_reg %4 |
| movdqa xmm1, [%5] |
| movdqa xmm2, xmm1 |
| movdqa xmm3, xmm1 |
| punpcklbw xmm2, xmm0 |
| punpckhbw xmm3, xmm0 |
| pmaddwd xmm2, xmm2 |
| pmaddwd xmm3, xmm3 |
| paddd xmm2, xmm3 |
| movdqa xmm3, xmm2 |
| psllq xmm2, 32 |
| psrlq xmm3, 32 |
| psllq xmm3, 32 |
| paddd xmm2, xmm3 |
| paddd sad_reg, xmm2 ; sqsum |
| |
| movdqa xmm2, [%6] |
| movdqa xmm3, xmm1 |
| psadbw xmm3, xmm0 |
| paddd sum_reg, xmm3 ; sum_cur |
| movdqa xmm3, xmm2 |
| psadbw xmm3, xmm0 |
| pslldq xmm3, 4 |
| paddd sum_reg, xmm3 ; sum_ref |
| |
| movdqa xmm3, xmm1 |
| pmaxub xmm3, xmm2 |
| pminub xmm2, xmm1 |
| psubb xmm3, xmm2 ; abs diff |
| pmaxub mad_reg, xmm3 ; max abs diff |
| |
| movdqa xmm1, xmm3 |
| psadbw xmm3, xmm0 |
| paddd sad_reg, xmm3 ; sad |
| |
| movdqa xmm3, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm3, xmm0 |
| pmaddwd xmm1, xmm1 |
| pmaddwd xmm3, xmm3 |
| paddd sqdiff_reg, xmm1 |
| paddd sqdiff_reg, xmm3 ; sqdiff |
| |
| add %5, %7 |
| add %6, %7 |
| %endmacro |
| |
| |
| ;*********************************************************************** |
| ; Code |
| ;*********************************************************************** |
| |
| SECTION .text |
| |
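; Note: WELS_EXTERN, LOAD_*_PARA(_POP), PUSH_XMM/POP_XMM, SIGN_EXTENSION and
; the r0-r7 register aliases used below are defined in asm_inc.asm and
; abstract over the x86/x64 Windows and System V calling conventions.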
| %ifdef X86_32 |
| |
| ;*********************************************************************** |
| ; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); |
| ;*********************************************************************** |
| WELS_EXTERN SampleVariance16x16_sse2 |
| push esi |
| push edi |
| push ebx |
| |
| sub esp, 16 |
| %define SUM [esp] |
| %define SUM_CUR [esp+4] |
| %define SQR [esp+8] |
| %define SQR_CUR [esp+12] |
| %define PUSH_SIZE 28 ; 12 + 16 |
| |
| mov edi, [esp+PUSH_SIZE+4] ; y_ref |
| mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride |
| mov esi, [esp+PUSH_SIZE+12] ; y_src |
| mov eax, [esp+PUSH_SIZE+16] ; y_src_stride |
| mov ecx, 010h ; height = 16 |
| |
| pxor xmm7, xmm7 |
| movdqu SUM, xmm7 |
| |
| .hloops: |
| movdqa xmm0, [edi] ; y_ref |
| movdqa xmm1, [esi] ; y_src |
| movdqa xmm2, xmm0 ; store first for future process |
| movdqa xmm3, xmm1 |
| ; sum += diff; |
| movdqa xmm4, xmm0 |
| psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] |
| ; to be continued for sum |
| pshufd xmm5, xmm4, 0C6h ; 11000110 B |
| paddw xmm4, xmm5 |
| movd ebx, xmm4 |
| add SUM, ebx |
| |
| ; sqr += diff * diff; |
| pmaxub xmm0, xmm1 |
| pminub xmm1, xmm2 |
| psubb xmm0, xmm1 ; diff |
| SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero |
| movd ebx, xmm1 |
| add SQR, ebx |
| |
| ; sum_cur += y_src[x]; |
| movdqa xmm0, xmm3 ; cur_orig |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm7 |
| punpckhbw xmm1, xmm7 |
| paddw xmm0, xmm1 ; 8x2 |
| SUM_WORD_8x2_SSE2 xmm0, xmm1 |
| movd ebx, xmm0 |
| and ebx, 0ffffh |
| add SUM_CUR, ebx |
| |
| ; sqr_cur += y_src[x] * y_src[x]; |
| SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero |
| movd ebx, xmm0 |
| add SQR_CUR, ebx |
| |
| lea edi, [edi+edx] |
| lea esi, [esi+eax] |
| dec ecx |
| jnz near .hloops |
| |
| mov ebx, 0 |
| mov bx, word SUM |
| sar ebx, 8 |
| imul ebx, ebx |
| mov ecx, SQR |
| sar ecx, 8 |
| sub ecx, ebx |
| mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture |
| mov [edi], cx ; to store uiMotionIndex |
| mov ebx, 0 |
| mov bx, word SUM_CUR |
| sar ebx, 8 |
| imul ebx, ebx |
| mov ecx, SQR_CUR |
| sar ecx, 8 |
| sub ecx, ebx |
| mov [edi+2], cx ; to store uiTextureIndex |
| |
| %undef SUM |
| %undef SUM_CUR |
| %undef SQR |
| %undef SQR_CUR |
| %undef PUSH_SIZE |
| |
| add esp, 16 |
| pop ebx |
| pop edi |
| pop esi |
| |
| ret |
| |
| |
| |
| ;************************************************************************************************************* |
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSad_sse2 |
| %define cur_data esp + pushsize + 4 |
| %define ref_data esp + pushsize + 8 |
| %define iPicWidth esp + pushsize + 12 |
| %define iPicHeight esp + pushsize + 16 |
| %define iPicStride esp + pushsize + 20 |
| %define psadframe esp + pushsize + 24 |
| %define psad8x8 esp + pushsize + 28 |
| %define pushsize 12 |
| push esi |
| push edi |
| push ebx |
| mov esi, [cur_data] |
| mov edi, [ref_data] |
| mov ebx, [iPicStride] |
| mov edx, [psad8x8] |
| mov eax, ebx |
| |
| shr dword [iPicWidth], 4 ; iPicWidth/16 |
| shr dword [iPicHeight], 4 ; iPicHeight/16 |
| shl eax, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm7, xmm7 ; iFrameSad |
| height_loop: |
| mov ecx, dword [iPicWidth] |
| push esi |
| push edi |
| width_loop: |
pxor xmm6, xmm6
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| paddd xmm7, xmm6 |
| movd [edx], xmm6 |
| psrldq xmm6, 8 |
| movd [edx+4], xmm6 |
| |
| pxor xmm6, xmm6 |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| WELS_SAD_16x2_SSE2 esi,edi,ebx |
| paddd xmm7, xmm6 |
| movd [edx+8], xmm6 |
| psrldq xmm6, 8 |
| movd [edx+12], xmm6 |
| |
| add edx, 16 |
| sub esi, eax |
| sub edi, eax |
| add esi, 16 |
| add edi, 16 |
| |
| dec ecx |
| jnz width_loop |
| |
| pop edi |
| pop esi |
| add esi, eax |
| add edi, eax |
| |
| dec dword [iPicHeight] |
| jnz height_loop |
| |
| mov edx, [psadframe] |
| movdqa xmm5, xmm7 |
| psrldq xmm7, 8 |
| paddd xmm7, xmm5 |
| movd [edx], xmm7 |
| |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef pushsize |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| |
| %else ;64-bit |
| |
| ;*********************************************************************** |
| ; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); |
| ;*********************************************************************** |
| WELS_EXTERN SampleVariance16x16_sse2 |
| %define SUM r10;[esp] |
| %define SUM_CUR r11;[esp+4] |
| %define SQR r13;[esp+8] |
| %define SQR_CUR r15;[esp+12] |
| |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| %assign push_num 4 |
| LOAD_5_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1,r1d |
| SIGN_EXTENSION r3,r3d |
| |
| mov r12,010h |
| pxor xmm7, xmm7 |
| movq SUM, xmm7 |
| movq SUM_CUR,xmm7 |
| movq SQR,xmm7 |
| movq SQR_CUR,xmm7 |
| |
| .hloops: |
| mov r14,0 |
| movdqa xmm0, [r0] ; y_ref |
| movdqa xmm1, [r2] ; y_src |
| movdqa xmm2, xmm0 ; store first for future process |
| movdqa xmm3, xmm1 |
| ; sum += diff; |
| movdqa xmm4, xmm0 |
| psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] |
| ; to be continued for sum |
| pshufd xmm5, xmm4, 0C6h ; 11000110 B |
| paddw xmm4, xmm5 |
| movd r14d, xmm4 |
| add SUM, r14 |
| |
| ; sqr += diff * diff; |
| pmaxub xmm0, xmm1 |
| pminub xmm1, xmm2 |
| psubb xmm0, xmm1 ; diff |
| SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero |
| movd r14d, xmm1 |
| add SQR, r14 |
| |
| ; sum_cur += y_src[x]; |
| movdqa xmm0, xmm3 ; cur_orig |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm7 |
| punpckhbw xmm1, xmm7 |
| paddw xmm0, xmm1 ; 8x2 |
| SUM_WORD_8x2_SSE2 xmm0, xmm1 |
| movd r14d, xmm0 |
| and r14, 0ffffh |
| add SUM_CUR, r14 |
| |
| ; sqr_cur += y_src[x] * y_src[x]; |
| SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero |
| movd r14d, xmm0 |
| add SQR_CUR, r14 |
| |
| lea r0, [r0+r1] |
| lea r2, [r2+r3] |
| dec r12 |
| jnz near .hloops |
| |
| mov r0, SUM |
| sar r0, 8 |
| imul r0, r0 |
| mov r1, SQR |
| sar r1, 8 |
| sub r1, r0 |
| mov [r4], r1w ; to store uiMotionIndex |
| mov r0, SUM_CUR |
| sar r0, 8 |
| imul r0, r0 |
| mov r1, SQR_CUR |
| sar r1, 8 |
| sub r1, r0 |
| mov [r4+2], r1w ; to store uiTextureIndex |
| |
| POP_XMM |
| LOAD_5_PARA_POP |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| |
| |
| %assign push_num 0 |
| |
| ret |
| |
| |
| ;************************************************************************************************************* |
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSad_sse2 |
| %define cur_data r0 |
| %define ref_data r1 |
| %define iPicWidth r2 |
| %define iPicHeight r3 |
| %define iPicStride r4 |
| %define psadframe r5 |
| %define psad8x8 r6 |
| |
| push r12 |
| push r13 |
| %assign push_num 2 |
| LOAD_7_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r2,r2d |
| SIGN_EXTENSION r3,r3d |
| SIGN_EXTENSION r4,r4d |
| |
| mov r12,r4 |
| shr r2, 4 ; iPicWidth/16 |
| shr r3, 4 ; iPicHeight/16 |
| |
| shl r12, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm7, xmm7 ; iFrameSad |
| height_loop: |
| mov r13, r2 |
| push r0 |
| push r1 |
| width_loop: |
| pxor xmm6, xmm6 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| paddd xmm7, xmm6 |
| movd [r6], xmm6 |
| psrldq xmm6, 8 |
| movd [r6+4], xmm6 |
| |
| pxor xmm6, xmm6 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| WELS_SAD_16x2_SSE2 r0,r1,r4 |
| paddd xmm7, xmm6 |
| movd [r6+8], xmm6 |
| psrldq xmm6, 8 |
| movd [r6+12], xmm6 |
| |
| add r6, 16 |
| sub r0, r12 |
| sub r1, r12 |
| add r0, 16 |
| add r1, 16 |
| |
| dec r13 |
| jnz width_loop |
| |
| pop r1 |
| pop r0 |
| add r0, r12 |
| add r1, r12 |
| |
| dec r3 |
| jnz height_loop |
| |
| ;mov r13, [psadframe] |
| movdqa xmm5, xmm7 |
| psrldq xmm7, 8 |
| paddd xmm7, xmm5 |
| movd [psadframe], xmm7 |
| |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef pushsize |
| POP_XMM |
| LOAD_7_PARA_POP |
| pop r13 |
| pop r12 |
| %assign push_num 0 |
| ret |
| |
| %endif |
| |
| |
| %ifdef X86_32 |
| ;************************************************************************************************************* |
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadVar_sse2 |
| %define localsize 8 |
| %define cur_data esp + pushsize + localsize + 4 |
| %define ref_data esp + pushsize + localsize + 8 |
| %define iPicWidth esp + pushsize + localsize + 12 |
| %define iPicHeight esp + pushsize + localsize + 16 |
| %define iPicStride esp + pushsize + localsize + 20 |
| %define psadframe esp + pushsize + localsize + 24 |
| %define psad8x8 esp + pushsize + localsize + 28 |
| %define psum16x16 esp + pushsize + localsize + 32 |
| %define psqsum16x16 esp + pushsize + localsize + 36 |
| %define tmp_esi esp + 0 |
| %define tmp_edi esp + 4 |
| %define pushsize 16 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| sub esp, localsize |
| mov esi, [cur_data] |
| mov edi, [ref_data] |
| mov ebx, [iPicStride] |
| mov edx, [psad8x8] |
| mov eax, ebx |
| |
| shr dword [iPicWidth], 4 ; iPicWidth/16 |
| shr dword [iPicHeight], 4 ; iPicHeight/16 |
| shl eax, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm7, xmm7 ; iFrameSad |
| var_height_loop: |
| mov ecx, dword [iPicWidth] |
| mov [tmp_esi], esi |
| mov [tmp_edi], edi |
| var_width_loop: |
| pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 |
| pxor xmm5, xmm5 ; pSum16x16 |
| pxor xmm4, xmm4 ; sqsum_16x16 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| paddd xmm7, xmm6 |
| movd [edx], xmm6 |
| psrldq xmm6, 8 |
| movd [edx+4], xmm6 |
| |
| pxor xmm6, xmm6 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx |
| paddd xmm7, xmm6 |
| movd [edx+8], xmm6 |
| psrldq xmm6, 8 |
| movd [edx+12], xmm6 |
| |
| mov ebp, [psum16x16] |
| movdqa xmm1, xmm5 |
| psrldq xmm1, 8 |
| paddd xmm5, xmm1 |
| movd [ebp], xmm5 |
| add dword [psum16x16], 4 |
| |
| movdqa xmm5, xmm4 |
| psrldq xmm5, 8 |
| paddd xmm4, xmm5 |
| movdqa xmm3, xmm4 |
| psrldq xmm3, 4 |
| paddd xmm4, xmm3 |
| |
| mov ebp, [psqsum16x16] |
| movd [ebp], xmm4 |
| add dword [psqsum16x16], 4 |
| |
| add edx, 16 |
| sub esi, eax |
| sub edi, eax |
| add esi, 16 |
| add edi, 16 |
| |
| dec ecx |
| jnz var_width_loop |
| |
| mov esi, [tmp_esi] |
| mov edi, [tmp_edi] |
| add esi, eax |
| add edi, eax |
| |
| dec dword [iPicHeight] |
| jnz var_height_loop |
| |
| mov edx, [psadframe] |
| movdqa xmm5, xmm7 |
| psrldq xmm7, 8 |
| paddd xmm7, xmm5 |
| movd [edx], xmm7 |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| %else ;64-bit |
| |
| ;************************************************************************************************************* |
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadVar_sse2 |
| %define cur_data arg1 ;r0 |
| %define ref_data arg2 ;r1 |
| %define iPicWidth arg3 ;r2 |
| %define iPicHeight arg4 ;r3 |
| %define iPicStride arg5 |
| %define psadframe arg6 |
| %define psad8x8 arg7 |
| %define psum16x16 arg8 |
| %define psqsum16x16 arg9 |
| |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| %assign push_num 4 |
| PUSH_XMM 8 |
| |
| %ifdef WIN64 |
| mov r4, arg5 ;iPicStride |
mov r5, arg6 ;psadframe
| %endif |
| mov r14,arg7 |
| SIGN_EXTENSION r2,r2d |
| SIGN_EXTENSION r3,r3d |
| SIGN_EXTENSION r4,r4d |
| |
| mov r13,r4 |
| shr r2,4 |
| shr r3,4 |
| |
| shl r13,4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm7, xmm7 ; iFrameSad |
| var_height_loop: |
| push r2 |
| %assign push_num push_num+1 |
| mov r11, r0 |
| mov r12, r1 |
| var_width_loop: |
| pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 |
| pxor xmm5, xmm5 ; pSum16x16 |
| pxor xmm4, xmm4 ; sqsum_16x16 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| paddd xmm7, xmm6 |
| movd [r14], xmm6 |
| psrldq xmm6, 8 |
| movd [r14+4], xmm6 |
| |
| pxor xmm6, xmm6 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 |
| paddd xmm7, xmm6 |
| movd [r14+8], xmm6 |
| psrldq xmm6, 8 |
| movd [r14+12], xmm6 |
| |
| mov r15, psum16x16 |
| movdqa xmm1, xmm5 |
| psrldq xmm1, 8 |
| paddd xmm5, xmm1 |
| movd [r15], xmm5 |
| add dword psum16x16, 4 |
| |
| movdqa xmm5, xmm4 |
| psrldq xmm5, 8 |
| paddd xmm4, xmm5 |
| movdqa xmm3, xmm4 |
| psrldq xmm3, 4 |
| paddd xmm4, xmm3 |
| |
| mov r15, psqsum16x16 |
| movd [r15], xmm4 |
| add dword psqsum16x16, 4 |
| |
| add r14,16 |
| sub r0, r13 |
| sub r1, r13 |
| add r0, 16 |
| add r1, 16 |
| |
| dec r2 |
| jnz var_width_loop |
| |
| pop r2 |
| %assign push_num push_num-1 |
| mov r0, r11 |
| mov r1, r12 |
| add r0, r13 |
| add r1, r13 |
| dec r3 |
| jnz var_height_loop |
| |
| mov r15, psadframe |
| movdqa xmm5, xmm7 |
| psrldq xmm7, 8 |
| paddd xmm7, xmm5 |
| movd [r15], xmm7 |
| |
| POP_XMM |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| %assign push_num 0 |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| %endif |
| |
| %ifdef X86_32 |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadSsd_sse2 |
| %define localsize 12 |
| %define cur_data esp + pushsize + localsize + 4 |
| %define ref_data esp + pushsize + localsize + 8 |
| %define iPicWidth esp + pushsize + localsize + 12 |
| %define iPicHeight esp + pushsize + localsize + 16 |
| %define iPicStride esp + pushsize + localsize + 20 |
| %define psadframe esp + pushsize + localsize + 24 |
| %define psad8x8 esp + pushsize + localsize + 28 |
| %define psum16x16 esp + pushsize + localsize + 32 |
| %define psqsum16x16 esp + pushsize + localsize + 36 |
| %define psqdiff16x16 esp + pushsize + localsize + 40 |
| %define tmp_esi esp + 0 |
| %define tmp_edi esp + 4 |
| %define tmp_sadframe esp + 8 |
| %define pushsize 16 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| sub esp, localsize |
| |
| mov esi, [cur_data] |
| mov edi, [ref_data] |
| mov ebx, [iPicStride] |
| mov edx, [psad8x8] |
| mov eax, ebx |
| |
| shr dword [iPicWidth], 4 ; iPicWidth/16 |
| shr dword [iPicHeight], 4 ; iPicHeight/16 |
| shl eax, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| movd [tmp_sadframe], xmm0 |
| sqdiff_height_loop: |
| mov ecx, dword [iPicWidth] |
| mov [tmp_esi], esi |
| mov [tmp_edi], edi |
| sqdiff_width_loop: |
| pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 |
| pxor xmm6, xmm6 ; pSum16x16 |
| pxor xmm5, xmm5 ; sqsum_16x16 four dword |
| pxor xmm4, xmm4 ; sqdiff_16x16 four Dword |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| movdqa xmm1, xmm7 |
| movd [edx], xmm7 |
| psrldq xmm7, 8 |
| paddd xmm1, xmm7 |
| movd [edx+4], xmm7 |
| movd ebp, xmm1 |
| add [tmp_sadframe], ebp |
| |
| pxor xmm7, xmm7 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx |
| movdqa xmm1, xmm7 |
| movd [edx+8], xmm7 |
| psrldq xmm7, 8 |
| paddd xmm1, xmm7 |
| movd [edx+12], xmm7 |
| movd ebp, xmm1 |
| add [tmp_sadframe], ebp |
| |
| mov ebp, [psum16x16] |
| movdqa xmm1, xmm6 |
| psrldq xmm1, 8 |
| paddd xmm6, xmm1 |
| movd [ebp], xmm6 |
| add dword [psum16x16], 4 |
| |
| mov ebp, [psqsum16x16] |
| pshufd xmm6, xmm5, 14 ;00001110 |
| paddd xmm6, xmm5 |
| pshufd xmm5, xmm6, 1 ;00000001 |
| paddd xmm5, xmm6 |
| movd [ebp], xmm5 |
| add dword [psqsum16x16], 4 |
| |
| mov ebp, [psqdiff16x16] |
| pshufd xmm5, xmm4, 14 ; 00001110 |
| paddd xmm5, xmm4 |
| pshufd xmm4, xmm5, 1 ; 00000001 |
| paddd xmm4, xmm5 |
| movd [ebp], xmm4 |
| add dword [psqdiff16x16], 4 |
| |
| add edx, 16 |
| sub esi, eax |
| sub edi, eax |
| add esi, 16 |
| add edi, 16 |
| |
| dec ecx |
| jnz sqdiff_width_loop |
| |
| mov esi, [tmp_esi] |
| mov edi, [tmp_edi] |
| add esi, eax |
| add edi, eax |
| |
| dec dword [iPicHeight] |
| jnz sqdiff_height_loop |
| |
| mov ebx, [tmp_sadframe] |
| mov eax, [psadframe] |
| mov [eax], ebx |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef psqdiff16x16 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef tmp_sadframe |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| %else |
| |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadSsd_sse2 |
| %define localsize 12 |
| %define cur_data arg1;r0 |
| %define ref_data arg2;r1 |
| %define iPicWidth arg3;r2 |
| %define iPicHeight arg4;r3 |
| %define iPicStride arg5; |
| %define psadframe arg6; |
| %define psad8x8 arg7; |
| %define psum16x16 arg8; |
| %define psqsum16x16 arg9; |
| %define psqdiff16x16 arg10 |
| |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| %assign push_num 4 |
| PUSH_XMM 10 |
| |
| %ifdef WIN64 |
| mov r4,arg5 |
| %endif |
| mov r14,arg7 |
| SIGN_EXTENSION r2,r2d |
| SIGN_EXTENSION r3,r3d |
| SIGN_EXTENSION r4,r4d |
| |
| mov r13,r4 |
| shr r2,4 ; iPicWidth/16 |
| shr r3,4 ; iPicHeight/16 |
| shl r13,4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm8, xmm8 ;framesad |
| pxor xmm9, xmm9 |
| sqdiff_height_loop: |
| ;mov ecx, dword [iPicWidth] |
| ;mov r14,r2 |
| push r2 |
| %assign push_num push_num +1 |
| mov r10, r0 |
| mov r11, r1 |
| sqdiff_width_loop: |
| pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 |
| pxor xmm6, xmm6 ; pSum16x16 |
| pxor xmm5, xmm5 ; sqsum_16x16 four dword |
| pxor xmm4, xmm4 ; sqdiff_16x16 four Dword |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| movdqa xmm1, xmm7 |
| movd [r14], xmm7 |
| psrldq xmm7, 8 |
| paddd xmm1, xmm7 |
| movd [r14+4], xmm7 |
| movd r15d, xmm1 |
| movd xmm9, r15d |
| paddd xmm8,xmm9 |
| |
| |
| pxor xmm7, xmm7 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 |
| movdqa xmm1, xmm7 |
| movd [r14+8], xmm7 |
| psrldq xmm7, 8 |
| paddd xmm1, xmm7 |
| movd [r14+12], xmm7 |
| movd r15d, xmm1 |
| movd xmm9, r15d |
| paddd xmm8,xmm9 |
| |
| mov r15, psum16x16 |
| movdqa xmm1, xmm6 |
| psrldq xmm1, 8 |
| paddd xmm6, xmm1 |
| movd [r15], xmm6 |
| add dword psum16x16, 4 |
| |
| mov r15, psqsum16x16 |
| pshufd xmm6, xmm5, 14 ;00001110 |
| paddd xmm6, xmm5 |
| pshufd xmm5, xmm6, 1 ;00000001 |
| paddd xmm5, xmm6 |
| movd [r15], xmm5 |
| add dword psqsum16x16, 4 |
| |
| mov r15, psqdiff16x16 |
| pshufd xmm5, xmm4, 14 ; 00001110 |
| paddd xmm5, xmm4 |
| pshufd xmm4, xmm5, 1 ; 00000001 |
| paddd xmm4, xmm5 |
| movd [r15], xmm4 |
| add dword psqdiff16x16, 4 |
| |
| add r14,16 |
| sub r0, r13 |
| sub r1, r13 |
| add r0, 16 |
| add r1, 16 |
| |
| dec r2 |
| jnz sqdiff_width_loop |
| |
| pop r2 |
| %assign push_num push_num -1 |
| |
| mov r0, r10 |
| mov r1, r11 |
| add r0, r13 |
| add r1, r13 |
| |
| dec r3 |
| jnz sqdiff_height_loop |
| |
| mov r13, psadframe |
| movd [r13], xmm8 |
| |
| POP_XMM |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| %assign push_num 0 |
| |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef psqdiff16x16 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef tmp_sadframe |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| |
| |
| %endif |
| |
| %ifdef X86_32 |
| ;************************************************************************************************************* |
| ;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadBgd_sse2 |
| %define localsize 12 |
| %define cur_data esp + pushsize + localsize + 4 |
| %define ref_data esp + pushsize + localsize + 8 |
| %define iPicWidth esp + pushsize + localsize + 12 |
| %define iPicHeight esp + pushsize + localsize + 16 |
| %define iPicStride esp + pushsize + localsize + 20 |
| %define psadframe esp + pushsize + localsize + 24 |
| %define psad8x8 esp + pushsize + localsize + 28 |
| %define p_sd8x8 esp + pushsize + localsize + 32 |
| %define p_mad8x8 esp + pushsize + localsize + 36 |
| %define tmp_esi esp + 0 |
| %define tmp_edi esp + 4 |
| %define tmp_ecx esp + 8 |
| %define pushsize 16 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| sub esp, localsize |
| mov esi, [cur_data] |
| mov edi, [ref_data] |
| mov ebx, [iPicStride] |
| mov eax, ebx |
| |
| shr dword [iPicWidth], 4 ; iPicWidth/16 |
| shr dword [iPicHeight], 4 ; iPicHeight/16 |
| shl eax, 4 ; iPicStride*16 |
| xor ebp, ebp |
| pxor xmm0, xmm0 |
| bgd_height_loop: |
| mov ecx, dword [iPicWidth] |
| mov [tmp_esi], esi |
| mov [tmp_edi], edi |
| bgd_width_loop: |
| pxor xmm7, xmm7 ; pSad8x8 |
| pxor xmm6, xmm6 ; sum_cur_8x8 |
| pxor xmm5, xmm5 ; sum_ref_8x8 |
| pxor xmm4, xmm4 ; pMad8x8 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| |
| |
| mov edx, [p_mad8x8] |
| WELS_MAX_REG_SSE2 xmm4 |
| |
| ;movdqa xmm1, xmm4 |
| ;punpcklbw xmm1, xmm0 |
| ;punpcklwd xmm1, xmm0 |
| ;movd [edx], xmm1 |
| ;punpckhbw xmm4, xmm0 |
| ;punpcklwd xmm4, xmm0 |
| ;movd [edx+4], xmm4 |
| ;add edx, 8 |
| ;mov [p_mad8x8], edx |
| mov [tmp_ecx], ecx |
| movhlps xmm1, xmm4 |
| movd ecx, xmm4 |
| mov [edx], cl |
| movd ecx, xmm1 |
| mov [edx+1],cl |
| add edx, 2 |
| mov [p_mad8x8], edx |
| |
| |
| pslldq xmm7, 4 |
| pslldq xmm6, 4 |
| pslldq xmm5, 4 |
| |
| |
| pxor xmm4, xmm4 ; pMad8x8 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx |
| |
| mov edx, [p_mad8x8] |
| WELS_MAX_REG_SSE2 xmm4 |
| |
| ;movdqa xmm1, xmm4 |
| ;punpcklbw xmm1, xmm0 |
| ;punpcklwd xmm1, xmm0 |
| ;movd [edx], xmm1 |
| ;punpckhbw xmm4, xmm0 |
| ;punpcklwd xmm4, xmm0 |
| ;movd [edx+4], xmm4 |
| ;add edx, 8 |
| ;mov [p_mad8x8], edx |
| movhlps xmm1, xmm4 |
| movd ecx, xmm4 |
| mov [edx], cl |
| movd ecx, xmm1 |
| mov [edx+1],cl |
| add edx, 2 |
| mov [p_mad8x8], edx |
| |
| ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 |
| |
| mov edx, [psad8x8] |
| pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 |
| movdqa [edx], xmm1 |
| add edx, 16 |
| mov [psad8x8], edx ; sad8x8 |
| |
| paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 |
| pshufd xmm2, xmm1, 00000011b |
| paddd xmm1, xmm2 |
| movd edx, xmm1 |
| add ebp, edx ; sad frame |
| |
| mov edx, [p_sd8x8] |
| psubd xmm6, xmm5 |
| pshufd xmm1, xmm6, 10001101b |
| movdqa [edx], xmm1 |
| add edx, 16 |
| mov [p_sd8x8], edx |
| |
| |
| sub esi, eax |
| sub edi, eax |
| add esi, 16 |
| add edi, 16 |
| |
| mov ecx, [tmp_ecx] |
| dec ecx |
| jnz bgd_width_loop |
| |
| mov esi, [tmp_esi] |
| mov edi, [tmp_edi] |
| add esi, eax |
| add edi, eax |
| |
| dec dword [iPicHeight] |
| jnz bgd_height_loop |
| |
| mov edx, [psadframe] |
| mov [edx], ebp |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, |
| ; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadSsdBgd_sse2 |
| %define localsize 16 |
| %define cur_data esp + pushsize + localsize + 4 |
| %define ref_data esp + pushsize + localsize + 8 |
| %define iPicWidth esp + pushsize + localsize + 12 |
| %define iPicHeight esp + pushsize + localsize + 16 |
| %define iPicStride esp + pushsize + localsize + 20 |
| %define psadframe esp + pushsize + localsize + 24 |
| %define psad8x8 esp + pushsize + localsize + 28 |
| %define psum16x16 esp + pushsize + localsize + 32 |
| %define psqsum16x16 esp + pushsize + localsize + 36 |
| %define psqdiff16x16 esp + pushsize + localsize + 40 |
| %define p_sd8x8 esp + pushsize + localsize + 44 |
| %define p_mad8x8 esp + pushsize + localsize + 48 |
| %define tmp_esi esp + 0 |
| %define tmp_edi esp + 4 |
| %define tmp_sadframe esp + 8 |
| %define tmp_ecx esp + 12 |
| %define pushsize 16 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| sub esp, localsize |
| mov esi, [cur_data] |
| mov edi, [ref_data] |
| mov ebx, [iPicStride] |
| mov eax, ebx |
| |
| shr dword [iPicWidth], 4 ; iPicWidth/16 |
| shr dword [iPicHeight], 4 ; iPicHeight/16 |
| shl eax, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| movd [tmp_sadframe], xmm0 |
| sqdiff_bgd_height_loop: |
| mov ecx, dword [iPicWidth] |
| mov [tmp_esi], esi |
| mov [tmp_edi], edi |
| sqdiff_bgd_width_loop: |
| pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 |
| pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 |
| pxor xmm5, xmm5 ; pMad8x8 |
| pxor xmm4, xmm4 ; sqdiff_16x16 four Dword |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| |
| mov edx, [psad8x8] |
| movdqa xmm2, xmm7 |
| pshufd xmm1, xmm2, 00001110b |
| movd [edx], xmm2 |
| movd [edx+4], xmm1 |
| add edx, 8 |
| mov [psad8x8], edx ; sad8x8 |
| |
| paddd xmm1, xmm2 |
| movd edx, xmm1 |
| add [tmp_sadframe], edx ; iFrameSad |
| |
| mov edx, [psum16x16] |
| movdqa xmm1, xmm6 |
| pshufd xmm2, xmm1, 00001110b |
| paddd xmm1, xmm2 |
| movd [edx], xmm1 ; sum |
| |
| mov edx, [p_sd8x8] |
| pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 |
| psubd xmm6, xmm1 ; 00 diff1 00 diff0 |
| pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 |
| movq [edx], xmm1 |
| add edx, 8 |
| mov [p_sd8x8], edx |
| |
| mov edx, [p_mad8x8] |
| WELS_MAX_REG_SSE2 xmm5 |
| ;movdqa xmm1, xmm5 |
| ;punpcklbw xmm1, xmm0 |
| ;punpcklwd xmm1, xmm0 |
| ;movd [edx], xmm1 |
| ;punpckhbw xmm5, xmm0 |
| ;punpcklwd xmm5, xmm0 |
| ;movd [edx+4], xmm5 |
| ;add edx, 8 |
| ;mov [p_mad8x8], edx |
| mov [tmp_ecx], ecx |
| movhlps xmm1, xmm5 |
| movd ecx, xmm5 |
| mov [edx], cl |
| movd ecx, xmm1 |
| mov [edx+1],cl |
| add edx, 2 |
| mov [p_mad8x8], edx |
| |
| psrlq xmm7, 32 |
| psllq xmm7, 32 ; clear sad |
| pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 |
| pxor xmm5, xmm5 ; pMad8x8 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx |
| |
| mov edx, [psad8x8] |
| movdqa xmm2, xmm7 |
| pshufd xmm1, xmm2, 00001110b |
| movd [edx], xmm2 |
| movd [edx+4], xmm1 |
| add edx, 8 |
| mov [psad8x8], edx ; sad8x8 |
| |
| paddd xmm1, xmm2 |
| movd edx, xmm1 |
| add [tmp_sadframe], edx ; iFrameSad |
| |
| mov edx, [psum16x16] |
| movdqa xmm1, xmm6 |
| pshufd xmm2, xmm1, 00001110b |
| paddd xmm1, xmm2 |
| movd ebp, xmm1 ; sum |
| add [edx], ebp |
| add edx, 4 |
| mov [psum16x16], edx |
| |
| mov edx, [psqsum16x16] |
| psrlq xmm7, 32 |
| pshufd xmm2, xmm7, 00001110b |
| paddd xmm2, xmm7 |
| movd [edx], xmm2 ; sqsum |
| add edx, 4 |
| mov [psqsum16x16], edx |
| |
| mov edx, [p_sd8x8] |
| pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 |
| psubd xmm6, xmm1 ; 00 diff1 00 diff0 |
| pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 |
| movq [edx], xmm1 |
| add edx, 8 |
| mov [p_sd8x8], edx |
| |
| mov edx, [p_mad8x8] |
| WELS_MAX_REG_SSE2 xmm5 |
| ;movdqa xmm1, xmm5 |
| ;punpcklbw xmm1, xmm0 |
| ;punpcklwd xmm1, xmm0 |
| ;movd [edx], xmm1 |
| ;punpckhbw xmm5, xmm0 |
| ;punpcklwd xmm5, xmm0 |
| ;movd [edx+4], xmm5 |
| ;add edx, 8 |
| ;mov [p_mad8x8], edx |
| movhlps xmm1, xmm5 |
| movd ecx, xmm5 |
| mov [edx], cl |
| movd ecx, xmm1 |
| mov [edx+1],cl |
| add edx, 2 |
| mov [p_mad8x8], edx |
| |
| mov edx, [psqdiff16x16] |
| pshufd xmm1, xmm4, 00001110b |
| paddd xmm4, xmm1 |
| pshufd xmm1, xmm4, 00000001b |
| paddd xmm4, xmm1 |
| movd [edx], xmm4 |
| add edx, 4 |
| mov [psqdiff16x16], edx |
| |
| sub esi, eax |
| sub edi, eax |
| add esi, 16 |
| add edi, 16 |
| |
| mov ecx, [tmp_ecx] |
| dec ecx |
| jnz sqdiff_bgd_width_loop |
| |
| mov esi, [tmp_esi] |
| mov edi, [tmp_edi] |
| add esi, eax |
| add edi, eax |
| |
| dec dword [iPicHeight] |
| jnz sqdiff_bgd_height_loop |
| |
| mov edx, [psadframe] |
| mov ebp, [tmp_sadframe] |
| mov [edx], ebp |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef psqdiff16x16 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| %else |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadBgd_sse2 |
| %define cur_data arg1; |
| %define ref_data arg2; |
| %define iPicWidth arg3; |
| %define iPicHeight arg4; |
| %define iPicStride arg5; |
| %define psadframe arg6; |
| %define psad8x8 arg7; |
| %define p_sd8x8 arg8; |
| %define p_mad8x8 arg9; |
| |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| %assign push_num 4 |
| PUSH_XMM 10 |
| %ifdef WIN64 |
| mov r4,arg5 |
| ; mov r5,arg6 |
| %endif |
| mov r14,arg7 |
| SIGN_EXTENSION r2,r2d |
| SIGN_EXTENSION r3,r3d |
| SIGN_EXTENSION r4,r4d |
| |
| |
| mov r13,r4 |
| mov r15,r0 |
| shr r2,4 |
| shr r3,4 |
| shl r13,4 |
| pxor xmm0, xmm0 |
| pxor xmm8, xmm8 |
| pxor xmm9, xmm9 |
| bgd_height_loop: |
| ;mov ecx, dword [iPicWidth] |
| push r2 |
| %assign push_num push_num+1 |
| mov r10, r15 |
| mov r11, r1 |
| bgd_width_loop: |
| pxor xmm7, xmm7 ; pSad8x8 |
| pxor xmm6, xmm6 ; sum_cur_8x8 |
| pxor xmm5, xmm5 ; sum_ref_8x8 |
| pxor xmm4, xmm4 ; pMad8x8 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| |
| |
| mov r14, p_mad8x8 |
| WELS_MAX_REG_SSE2 xmm4 |
| |
| ;mov [tmp_ecx], ecx |
| movhlps xmm1, xmm4 |
| movd r0d, xmm4 |
| |
| |
| mov [r14], r0b |
| movd r0d, xmm1 |
| mov [r14+1],r0b |
| add r14, 2 |
| ;mov p_mad8x8, r14 |
| |
| |
| pslldq xmm7, 4 |
| pslldq xmm6, 4 |
| pslldq xmm5, 4 |
| |
| |
| pxor xmm4, xmm4 ; pMad8x8 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 |
| |
| ;mov r14, [p_mad8x8] |
| WELS_MAX_REG_SSE2 xmm4 |
| |
| movhlps xmm1, xmm4 |
| movd r0d, xmm4 |
| mov [r14], r0b |
| movd r0d, xmm1 |
| mov [r14+1],r0b |
| add r14, 2 |
| mov p_mad8x8, r14 |
| |
| ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 |
| |
| mov r14, psad8x8 |
| pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 |
| movdqa [r14], xmm1 |
| add r14, 16 |
| mov psad8x8, r14 ; sad8x8 |
| |
| paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 |
| pshufd xmm2, xmm1, 00000011b |
| paddd xmm1, xmm2 |
| movd r14d, xmm1 |
| movd xmm9, r14d |
| paddd xmm8, xmm9 ; sad frame |
| |
| mov r14, p_sd8x8 |
| psubd xmm6, xmm5 |
| pshufd xmm1, xmm6, 10001101b |
| movdqa [r14], xmm1 |
| add r14, 16 |
| mov p_sd8x8, r14 |
| |
| |
| ;add edx, 16 |
| sub r15, r13 |
| sub r1, r13 |
| add r15, 16 |
| add r1, 16 |
| |
| |
| dec r2 |
| jnz bgd_width_loop |
| pop r2 |
| %assign push_num push_num-1 |
| mov r15, r10 |
| mov r1, r11 |
| add r15, r13 |
| add r1, r13 |
| |
| dec r3 |
| jnz bgd_height_loop |
| |
| mov r13, psadframe |
| movd [r13], xmm8 |
| |
| POP_XMM |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| %assign push_num 0 |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| |
| |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, |
| ; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| |
| WELS_EXTERN VAACalcSadSsdBgd_sse2 |
| %define cur_data arg1; |
| %define ref_data arg2; |
| %define iPicWidth arg3; |
| %define iPicHeight arg4; |
| %define iPicStride arg5; |
| %define psadframe arg6; |
| %define psad8x8 arg7; |
| %define psum16x16 arg8; |
| %define psqsum16x16 arg9; |
| %define psqdiff16x16 arg10; |
| %define p_sd8x8 arg11 |
| %define p_mad8x8 arg12 |
| |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| %assign push_num 4 |
| PUSH_XMM 10 |
| %ifdef WIN64 |
| mov r4,arg5 |
| ;mov r5,arg6 |
| %endif |
| SIGN_EXTENSION r2,r2d |
| SIGN_EXTENSION r3,r3d |
| SIGN_EXTENSION r4,r4d |
| |
| mov r13,r4 |
| shr r2, 4 ; iPicWidth/16 |
| shr r3, 4 ; iPicHeight/16 |
| shl r13, 4 ; iPicStride*16 |
| pxor xmm0, xmm0 |
| pxor xmm8, xmm8 |
| pxor xmm9, xmm9 |
| |
| |
| sqdiff_bgd_height_loop: |
| mov r10, r0 |
| mov r11, r1 |
| push r2 |
| %assign push_num push_num+1 |
| sqdiff_bgd_width_loop: |
| |
| pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 |
| pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 |
| pxor xmm5, xmm5 ; pMad8x8 |
| pxor xmm4, xmm4 ; sqdiff_16x16 four Dword |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| |
| mov r14, psad8x8 |
| movdqa xmm2, xmm7 |
| pshufd xmm1, xmm2, 00001110b |
| movd [r14], xmm2 |
| movd [r14+4], xmm1 |
| add r14, 8 |
| mov psad8x8, r14 ; sad8x8 |
| |
| paddd xmm1, xmm2 |
| movd r14d, xmm1 |
| movd xmm9,r14d |
| paddd xmm8, xmm9 ; iFrameSad |
| |
| mov r14, psum16x16 |
| movdqa xmm1, xmm6 |
| pshufd xmm2, xmm1, 00001110b |
| paddd xmm1, xmm2 |
| movd [r14], xmm1 ; sum |
| |
| mov r14, p_sd8x8 |
| pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 |
| psubd xmm6, xmm1 ; 00 diff1 00 diff0 |
| pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 |
| movq [r14], xmm1 |
| add r14, 8 |
| mov p_sd8x8, r14 |
| |
| mov r14, p_mad8x8 |
| WELS_MAX_REG_SSE2 xmm5 |
| |
| movhlps xmm1, xmm5 |
| push r0 |
| movd r0d, xmm5 |
| mov [r14], r0b |
| movd r0d, xmm1 |
| mov [r14+1],r0b |
| pop r0 |
| add r14, 2 |
| mov p_mad8x8, r14 |
| |
| psrlq xmm7, 32 |
| psllq xmm7, 32 ; clear sad |
| pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 |
| pxor xmm5, xmm5 ; pMad8x8 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 |
| |
| mov r14, psad8x8 |
| movdqa xmm2, xmm7 |
| pshufd xmm1, xmm2, 00001110b |
| movd [r14], xmm2 |
| movd [r14+4], xmm1 |
| add r14, 8 |
| mov psad8x8, r14 ; sad8x8 |
| |
| paddd xmm1, xmm2 |
| movd r14d, xmm1 |
| movd xmm9, r14d |
| paddd xmm8, xmm9 ; iFrameSad |
| |
| mov r14, psum16x16 |
| movdqa xmm1, xmm6 |
| pshufd xmm2, xmm1, 00001110b |
| paddd xmm1, xmm2 |
| movd r15d, xmm1 ; sum |
| add [r14], r15d |
| add r14, 4 |
| mov psum16x16, r14 |
| |
| mov r14, psqsum16x16 |
| psrlq xmm7, 32 |
| pshufd xmm2, xmm7, 00001110b |
| paddd xmm2, xmm7 |
| movd [r14], xmm2 ; sqsum |
| add r14, 4 |
| mov psqsum16x16, r14 |
| |
| mov r14, p_sd8x8 |
| pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 |
| psubd xmm6, xmm1 ; 00 diff1 00 diff0 |
| pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 |
| movq [r14], xmm1 |
| add r14, 8 |
| mov p_sd8x8, r14 |
| |
| mov r14, p_mad8x8 |
| WELS_MAX_REG_SSE2 xmm5 |
| |
| |
| movhlps xmm1, xmm5 |
| push r0 |
| movd r0d, xmm5 |
| mov [r14], r0b |
| movd r0d, xmm1 |
| mov [r14+1],r0b |
| pop r0 |
| add r14, 2 |
| mov p_mad8x8, r14 |
| |
| mov r14, psqdiff16x16 |
| pshufd xmm1, xmm4, 00001110b |
| paddd xmm4, xmm1 |
| pshufd xmm1, xmm4, 00000001b |
| paddd xmm4, xmm1 |
| movd [r14], xmm4 |
| add r14, 4 |
| mov psqdiff16x16, r14 |
| |
| sub r0, r13 |
| sub r1, r13 |
| add r0, 16 |
| add r1, 16 |
| |
| dec r2 |
| jnz sqdiff_bgd_width_loop |
| pop r2 |
| %assign push_num push_num-1 |
| mov r0, r10 |
| mov r1, r11 |
| add r0, r13 |
| add r1, r13 |
| |
| dec r3 |
| jnz sqdiff_bgd_height_loop |
| |
| mov r14, psadframe |
| movd [r14], xmm8 |
| |
| POP_XMM |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| %assign push_num 0 |
| %undef cur_data |
| %undef ref_data |
| %undef iPicWidth |
| %undef iPicHeight |
| %undef iPicStride |
| %undef psadframe |
| %undef psad8x8 |
| %undef psum16x16 |
| %undef psqsum16x16 |
| %undef psqdiff16x16 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| %undef tmp_esi |
| %undef tmp_edi |
| %undef pushsize |
| %undef localsize |
| ret |
| %endif |
| |
| %ifdef X86_32 |
| %define ptrword dword |
| %else |
| %define ptrword qword |
| %endif |
| |
| %define xmm_width 16 |
| %define ymm_width 32 |
| |
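; PUSHM/POPM push/pop a variadic list of registers (POPM restores in reverse
; order) and keep the push_num stack-depth bookkeeping in sync. Usage sketch:
;   PUSHM r12, r13   ; push r12; push r13; push_num += 2
;   ...
;   POPM  r12, r13   ; pop r13; pop r12;  push_num -= 2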
| %macro PUSHM 1-* |
| %rep %0 |
| push %1 |
| %rotate 1 |
| %endrep |
| %assign push_num push_num + %0 |
| %endmacro |
| |
| %macro POPM 1-* |
| %rep %0 |
| %rotate -1 |
| pop %1 |
| %endrep |
| %assign push_num push_num - %0 |
| %endmacro |
| |
| %ifdef X86_32 |
| %define stack_alloc_min 4 |
| %else |
| %define stack_alloc_min 8 |
| %endif |
| |
| ; Allocate aligned stack space. |
| ; address_out=%1 size=%2 alignment=%3 |
| %macro STACK_ALLOC 3 |
| %if (%3) & ((%3) - 1) |
| %error non-power-of-2 alignment requested. |
| %endif |
| %if (%3) > 0 |
| %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min |
| %else |
| %assign stack_alloc_align 1 |
| %endif |
| %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1 |
| %assign push_num push_num + stack_alloc_num |
| sub r7, stack_alloc_min * stack_alloc_num |
| %if stack_alloc_align == 1 |
| mov %1, r7 |
| %else |
| lea %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)] |
| and %1, -(stack_alloc_min * stack_alloc_align) |
| %endif |
| %endmacro |
| |
| ; Deallocate stack space allocated with STACK_ALLOC. |
| %macro STACK_DEALLOC 0 |
| add r7, stack_alloc_min * stack_alloc_num |
| %assign push_num push_num - stack_alloc_num |
| %endmacro |
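; Usage sketch: reserve a 32-byte scratch area aligned to 16 bytes, then
; release it once done.
;   STACK_ALLOC r5, 32, 16   ; r5 -> 16-byte-aligned scratch space
;   ...
;   STACK_DEALLOC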
| |
| %ifdef HAVE_AVX2 |
| ; Max unsigned byte per quadword |
| ; out=%1 in=%2 tmp=%3 |
| %macro AVX2_Maxubq 3 |
| vpsrlq %3, %2, 32 |
| vpmaxub %1, %2, %3 |
| vpsrlq %3, %1, 16 |
| vpmaxub %1, %1, %3 |
| vpsrlq %3, %1, 8 |
| vpmaxub %1, %1, %3 |
| %endmacro |
| |
| ; Max unsigned byte per quadword. 2 register input. |
| ; Results interleaved as least significant byte of even/odd doublewords. |
| ; out=%1 in_a=%2 in_b=%3 tmp=%4 |
| %macro AVX2_Maxubq2 4 |
| vpblendd %4, %2, %3, 10101010b |
| vpshufd %4, %4, 10110001b |
| vpblendd %1, %2, %3, 01010101b |
| vpmaxub %1, %4, %1 |
| vpsrld %4, %1, 16 |
| vpmaxub %1, %1, %4 |
| vpsrld %4, %1, 8 |
| vpmaxub %1, %1, %4 |
| %endmacro |
| |
| ; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5 |
| %macro AVX2_Sqsumbdw 5 |
| vpunpcklbw %4, %2, %3 |
| %if %5 |
| vpmaddwd %4, %4, %4 |
| vpaddd %1, %1, %4 |
| %else |
| vpmaddwd %1, %4, %4 |
| %endif |
| vpunpckhbw %4, %2, %3 |
| vpmaddwd %4, %4, %4 |
| vpaddd %1, %1, %4 |
| %endmacro |
| |
| ; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5 |
| %macro AVX2_Sumbdw 5 |
| %if %5 |
| vpsadbw %4, %2, %3 |
| vpaddd %1, %1, %4 |
| %else |
| vpsadbw %1, %2, %3 |
| %endif |
| %endmacro |
| |
; Per-byte absolute difference via saturating subtractions.
; res=%1 a=%2 b=%3 a_again=%4 (same value as %2; may be a memory operand) tmp=%5
| %macro AVX2_AbsDiffub 5 |
| vpsubusb %5, %2, %3 |
| vpsubusb %1, %3, %4 |
| vpor %1, %5, %1 |
| %endmacro |
| |
| ; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5 |
| %macro AVX2_Sadbdw 5 |
| %if %5 |
| vpsadbw %4, %2, %3 |
| vpaddd %1, %1, %4 |
| %else |
| vpsadbw %1, %2, %3 |
| %endif |
| %endmacro |
| |
| ; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8 |
| %macro AVX2_SadSumSqsumbdw 8 |
| AVX2_Sadbdw %1, %4, %5, %7, %8 |
| AVX2_Sumbdw %2, %4, %6, %7, %8 |
| AVX2_Sqsumbdw %3, %4, %6, %7, %8 |
| %endmacro |
| |
| ; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5 |
| %macro AVX2_Sad 5 |
| vmovdqu %4, [%2] |
| AVX2_Sadbdw %1, %4, [%3], %4, %5 |
| %endmacro |
| |
| ; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9 |
| %macro AVX2_SadSumSqsum 9 |
| vmovdqu %7, [%4] |
| AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9 |
| %endmacro |
| |
| ; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11 |
| %macro AVX2_SadSumSqsumSqdiff 11 |
| vmovdqu %8, [%5] |
| vmovdqu %9, [%6] |
| AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11 |
| AVX2_AbsDiffub %9, %8, %9, %8, %10 |
| AVX2_Sqsumbdw %4, %9, %7, %10, %11 |
| %endmacro |
| |
| ; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11 |
| %macro AVX2_SadSdMad 11 |
| vmovdqu %8, [%5] |
| vmovdqu %9, [%6] |
| AVX2_Sumbdw %2, %8, %7, %10, %11 |
| AVX2_Sumbdw %3, %9, %7, %10, %11 |
| AVX2_Sadbdw %1, %8, %9, %10, %11 |
| %if %11 |
| AVX2_AbsDiffub %9, %8, %9, %8, %10 |
| vpmaxub %4, %4, %9 |
| %else |
| AVX2_AbsDiffub %4, %8, %9, %8, %10 |
| %endif |
| %endmacro |
| |
| ; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13 |
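| ; If tmp %12 is the literal 0 (the register-starved X86_32 case), cur_data is |
| ; reloaded from [%7] where a third temporary would otherwise be used. |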
| %macro AVX2_SadBgdSqdiff 13 |
| %ifidn %12, 0 |
| vmovdqu %10, [%7] |
| AVX2_Sumbdw %2, %10, %9, %11, %13 |
| AVX2_Sqsumbdw %6, %10, %9, %11, %13 |
| vmovdqu %11, [%8] |
| AVX2_Sadbdw %1, %10, %11, %10, %13 |
| AVX2_Sumbdw %3, %11, %9, %10, %13 |
| vmovdqu %10, [%7] |
| %if %13 |
| AVX2_AbsDiffub %11, %10, %11, [%7], %10 |
| vpmaxub %4, %4, %11 |
| AVX2_Sqsumbdw %5, %11, %9, %10, %13 |
| %else |
| AVX2_AbsDiffub %4, %10, %11, [%7], %10 |
| AVX2_Sqsumbdw %5, %4, %9, %10, %13 |
| %endif |
| %else |
| vmovdqu %10, [%7] |
| vmovdqu %11, [%8] |
| AVX2_Sadbdw %1, %10, %11, %12, %13 |
| AVX2_Sumbdw %2, %10, %9, %12, %13 |
| AVX2_Sumbdw %3, %11, %9, %12, %13 |
| AVX2_Sqsumbdw %6, %10, %9, %12, %13 |
| %if %13 |
| AVX2_AbsDiffub %11, %10, %11, %10, %12 |
| vpmaxub %4, %4, %11 |
| AVX2_Sqsumbdw %5, %11, %9, %10, %13 |
| %else |
| AVX2_AbsDiffub %4, %10, %11, %10, %12 |
| AVX2_Sqsumbdw %5, %4, %9, %10, %13 |
| %endif |
| %endif |
| %endmacro |
| |
| ; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5 |
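| ; Packs the two per-lane SAD doublewords to the bottom of each lane. For the |
| ; first blocks the full-width store writes don't-care upper halves, which the |
| ; later second_blocks stores at +8/+24 overwrite. |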
| %macro AVX2_Store8x8Accdw 5 |
| vpshufd %2%4, %2%3, 1000b |
| %ifidni %2, x |
| vmovlps [%1 + 8 * %5], x%4 |
| %elif %5 == 0 |
| vmovdqu [%1], %2%4 |
| %else |
| vmovlps [%1 + 8], x%4 |
| vextracti128 x%4, %2%4, 1 |
| vmovlps [%1 + 24], x%4 |
| %endif |
| %endmacro |
| |
| ; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5 |
| %macro AVX2_Store8x8Accb 5 |
| vpunpckhqdq %2%4, %2%3, %2%3 |
| vpunpcklbw %2%4, %2%3, %2%4 |
| %if %5 == 0 |
| vmovd [%1 + 0], x%4 |
| %ifidni %2, y |
| vextracti128 x%4, %2%4, 1 |
| vmovd [%1 + 4], x%4 |
| %endif |
| %else |
| vpextrw [%1 + 2], x%4, 0 |
| %ifidni %2, y |
| vextracti128 x%4, %2%4, 1 |
| vpextrw [%1 + 6], x%4, 0 |
| %endif |
| %endif |
| %endmacro |
| |
| ; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5 |
| %macro AVX2_Store2x8x8Accb 5 |
| vpunpckhqdq y%3, y%2, y%2 |
| vpunpcklbw y%3, y%2, y%3 |
| vextracti128 x%4, y%3, 1 |
| vpsllq x%4, x%4, 32 |
| vpblendd x%4, x%3, x%4, 1010b |
| %if %5 |
| vpslld x%4, x%4, 16 |
| vpblendw x%4, x%4, [%1], 01010101b |
| %endif |
| vmovdqu [%1], x%4 |
| %endmacro |
| |
| ; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5 |
| %macro AVX2_Store16x16Accdw 5 |
| %ifidni %2, x |
| %if %5 |
| vmovd x%4, [%1 + 0] |
| vpaddd x%3, x%4, x%3 |
| %endif |
| vmovd [%1 + 0], x%3 |
| %elif %5 == 0 |
| vmovd [%1 + 0], x%3 |
| vextracti128 x%3, %2%3, 1 |
| vmovd [%1 + 4], x%3 |
| %else |
| vextracti128 x%4, %2%3, 1 |
| vpunpckldq x%4, x%3, x%4 |
| vmovq x%3, [%1 + 0] |
| vpaddd x%3, x%3, x%4 |
| vmovlps [%1 + 0], x%3 |
| %endif |
| %endmacro |
| |
| ; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8 |
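| ; Expects each 128-bit lane to carry one result in its low quadword (bound |
| ; for p_dst1) and another in its high quadword (bound for p_dst2); one |
| ; doubleword per lane is stored to each destination, with optional |
| ; accumulation into the existing values. |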
| %macro AVX2_Store2x16x16Accdw 8 |
| %ifidni %5, x |
| mov %4, %1 |
| %if %8 == 0 |
| vmovd [%4 + %3], x%6 |
| mov %4, %2 |
| vpextrd [%4 + %3], x%6, 2 |
| %else |
| vmovd x%7, [%4 + %3] |
| vpaddd x%7, x%7, x%6 |
| vmovd [%4 + %3], x%7 |
| mov %4, %2 |
| vpbroadcastd x%7, [%4 + %3] |
| vpaddd x%7, x%7, x%6 |
| vpextrd [%4 + %3], x%7, 2 |
| %endif |
| %else |
| vextracti128 x%7, %5%6, 1 |
| vpblendd x%6, x%6, x%7, 1010b |
| mov %4, %1 |
| %if %8 == 0 |
| vmovlps [%4 + %3], x%6 |
| mov %4, %2 |
| vmovhps [%4 + %3], x%6 |
| %else |
| vmovq x%7, [%4 + %3] |
| vpaddd x%7, x%7, x%6 |
| vmovlps [%4 + %3], x%7 |
| mov %4, %2 |
| vpbroadcastq x%7, [%4 + %3] |
| vpaddd x%7, x%7, x%6 |
| vmovhps [%4 + %3], x%7 |
| %endif |
| %endif |
| %endmacro |
| |
| |
| ; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7 |
| %macro AVX2_CalcSad_8Lines 7 |
| %define mm_tmp0 %2 |
| %define mm_sad %3 |
| %define mm_sad2 %4 |
| %define mm_sad3 %5 |
| %define mm_sad4 %6 |
| %define b_second_blocks %7 |
| %ifdef i_stride5 |
| %define i_stride5_ i_stride5 |
| %else |
| lea r_tmp, [5 * i_stride] |
| %define i_stride5_ r_tmp |
| %endif |
| ; Use multiple accumulators to shorten dependency chains and enable more parallelism. |
| AVX2_Sad %1 %+ mm_sad, p_cur, p_ref, %1 %+ mm_tmp0, 0 |
| AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_tmp0, 0 |
| AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_tmp0, 0 |
| AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_tmp0, 0 |
| AVX2_Sad %1 %+ mm_sad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_tmp0, 1 |
| AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1 |
| %ifdef i_stride7 |
| %define i_stride7_ i_stride7 |
| %else |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| %define i_stride7_ r_tmp |
| %endif |
| AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_tmp0, 1 |
| AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1 |
| %undef i_stride5_ |
| %undef i_stride7_ |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| add p_cur, %1 %+ mm_width |
| add p_ref, %1 %+ mm_width |
| ; Collapse accumulators. |
| vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad2 |
| vpaddd %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4 |
| vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad3 |
| AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks |
| vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad |
| %undef mm_tmp0 |
| %undef mm_sad |
| %undef mm_sad2 |
| %undef mm_sad3 |
| %undef mm_sad4 |
| %undef b_second_blocks |
| %endmacro |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSad_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) |
| ;************************************************************************************************************* |
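| ; Result layout as produced by the stores below: psad8x8 gets one 32-bit SAD |
| ; per 8x8 block, four consecutive entries per 16x16 macroblock in raster |
| ; order; *psadframe gets the frame total. iPicWidth and iPicHeight are |
| ; rounded down to multiples of 16. |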
| |
| WELS_EXTERN VAACalcSad_avx2 |
| %define p_sadframe ptrword arg6 |
| %define p_sad8x8 ptrword arg7 |
| %ifdef X86_32 |
| %define saveregs r5, r6 |
| %else |
| %define saveregs rbx, rbp, r12 |
| %endif |
| |
| %assign push_num 0 |
| LOAD_5_PARA |
| PUSH_XMM 7 |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| PUSHM saveregs |
| |
| %define mm_zero mm0 |
| %define mm_sadframe mm6 |
| vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero |
| vmovdqa y %+ mm_sadframe, y %+ mm_zero |
| |
| and r2, -16 ; iPicWidth &= -16 |
| jle .done ; bail if iPicWidth < 16 |
| sar r3, 4 ; iPicHeight / 16 |
| jle .done ; bail if iPicHeight < 16 |
| shr r2, 2 ; iPicWidth / 4 |
| |
| %define p_cur r0 |
| %define p_ref r1 |
| %define i_xcnt r2 |
| %define i_ycnt ptrword arg4 |
| %define i_stride r4 |
| %define xcnt_unit 4 |
| %ifdef X86_32 |
| mov i_ycnt, r3 |
| mov r5, p_sad8x8 |
| %define i_stride3 r3 |
| %undef p_sad8x8 |
| %define p_sad8x8 r5 |
| %define r_tmp r6 |
| lea i_stride3, [3 * i_stride] |
| %else |
| mov rbp, p_sad8x8 |
| %define i_stride3 rbx |
| %define i_stride5 r12 |
| %define i_stride7 r6 |
| %undef p_sad8x8 |
| %define p_sad8x8 rbp |
| lea i_stride3, [3 * i_stride] |
| lea i_stride5, [5 * i_stride] |
| lea i_stride7, [i_stride + 2 * i_stride3] |
| %endif |
| |
| ; offset pointer so as to compensate for the i_xcnt offset below. |
| sub p_sad8x8, 4 * 16 / xcnt_unit |
| |
| push i_xcnt |
| %assign push_num push_num + 1 |
| %define i_xcnt_load ptrword [r7] |
| |
| .height_loop: |
| ; use end-of-line pointers so as to enable use of a negative counter as index. |
| lea p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt] |
| ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. |
| neg i_xcnt |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_upper8_remaining16 |
| .width_loop_upper8: |
| AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_upper8 |
| jg .width_loop_upper8_end |
| .width_loop_upper8_remaining16: |
| AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0 |
| .width_loop_upper8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| xor i_xcnt, i_xcnt |
| sub i_xcnt, i_xcnt_load |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_lower8_remaining16 |
| .width_loop_lower8: |
| AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_lower8 |
| jg .width_loop_lower8_end |
| .width_loop_lower8_remaining16: |
| AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1 |
| .width_loop_lower8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| xor i_xcnt, i_xcnt |
| sub i_xcnt, i_xcnt_load |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| neg i_xcnt |
| sub i_ycnt, 1 |
| jnz .height_loop |
| |
| pop i_xcnt |
| %assign push_num push_num - 1 |
| %undef i_xcnt_load |
| |
| .done: |
| mov r6, p_sadframe |
| vextracti128 xmm2, y %+ mm_sadframe, 1 |
| vpaddd xmm2, x %+ mm_sadframe, xmm2 |
| vpunpckhqdq xmm1, xmm2, xmm2 |
| vpaddd xmm2, xmm2, xmm1 |
| vmovd [r6], xmm2 |
| vzeroupper |
| |
| POPM saveregs |
| POP_XMM |
| LOAD_5_PARA_POP |
| %undef p_cur |
| %undef p_ref |
| %undef i_xcnt |
| %undef i_ycnt |
| %undef i_stride |
| %undef r_tmp |
| %undef xcnt_unit |
| %undef i_stride3 |
| %undef i_stride5 |
| %undef i_stride7 |
| %undef mm_sadframe |
| %undef mm_zero |
| %undef saveregs |
| %undef p_sadframe |
| %undef p_sad8x8 |
| ret |
| |
| |
| ; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7 |
| %macro AVX2_CalcSadVar_8Lines 7 |
| %define mm_tmp0 %2 |
| %define mm_tmp1 %3 |
| %define mm_sad %4 |
| %define mm_sum %5 |
| %define mm_sqsum %6 |
| %define b_second_blocks %7 |
| ; Unroll for better performance on Haswell. |
| ; Avoid unrolling for the 16 px case so as to reduce the code footprint. |
| %ifidni %1, y |
| lea r_tmp, [5 * i_stride] |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| add p_cur, %1 %+ mm_width |
| add p_ref, %1 %+ mm_width |
| %else |
| vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad |
| vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum |
| vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum |
| lea r_tmp, [8 * i_stride] |
| add p_cur, r_tmp |
| add p_ref, r_tmp |
| neg r_tmp |
| %%loop: |
| AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 |
| add r_tmp, i_stride |
| jl %%loop |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| lea r_tmp, [8 * i_stride - %1 %+ mm_width] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| %endif |
| AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks |
| vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad |
| vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum |
| vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| %undef mm_tmp0 |
| %undef mm_tmp1 |
| %undef mm_sad |
| %undef mm_sum |
| %undef mm_sqsum |
| %undef b_second_blocks |
| %endmacro |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadVar_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) |
| ;************************************************************************************************************* |
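| ; Beyond the psad8x8/psadframe layout of VAACalcSad_avx2, psum16x16 and |
| ; psqsum16x16 get one 32-bit entry per 16x16 macroblock: the sum and the |
| ; sum of squares of the current-frame pixels, respectively. |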
| |
| WELS_EXTERN VAACalcSadVar_avx2 |
| %define p_sadframe ptrword arg6 |
| %define p_sad8x8 ptrword arg7 |
| %define p_sum16x16 ptrword arg8 |
| %define p_sqsum16x16 ptrword arg9 |
| %ifdef X86_32 |
| %define saveregs r5, r6 |
| %else |
| %define saveregs rbx, rbp, r12, r13 |
| %endif |
| |
| %assign push_num 0 |
| LOAD_5_PARA |
| PUSH_XMM 7 |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| PUSHM saveregs |
| |
| %define mm_zero mm0 |
| %define mm_sadframe mm6 |
| vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero |
| vmovdqa y %+ mm_sadframe, y %+ mm_zero |
| |
| and r2, -16 ; iPicWidth &= -16 |
| jle .done ; bail if iPicWidth < 16 |
| sar r3, 4 ; iPicHeight / 16 |
| jle .done ; bail if iPicHeight < 16 |
| shr r2, 2 ; iPicWidth / 4 |
| |
| %define p_cur r0 |
| %define p_ref r1 |
| %define i_xcnt r2 |
| %define i_ycnt ptrword arg4 |
| %define i_stride r4 |
| %define r_tmp r6 |
| %define xcnt_unit 4 |
| %ifdef X86_32 |
| mov i_ycnt, r3 |
| mov r3, p_sad8x8 |
| %undef p_sad8x8 |
| %define p_sad8x8 r3 |
| %define i_stride3 r5 |
| %else |
| mov rbp, p_sad8x8 |
| mov r12, p_sum16x16 |
| mov r13, p_sqsum16x16 |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| %define p_sad8x8 rbp |
| %define p_sum16x16 r12 |
| %define p_sqsum16x16 r13 |
| %define i_stride3 rbx |
| %endif |
| lea i_stride3, [3 * i_stride] |
| |
| ; offset pointers so as to compensate for the i_xcnt offset below. |
| sub p_sad8x8, 4 * 16 / xcnt_unit |
| sub p_sum16x16, 1 * 16 / xcnt_unit |
| sub p_sqsum16x16, 1 * 16 / xcnt_unit |
| |
| ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. |
| neg i_xcnt |
| |
| .height_loop: |
| push i_xcnt |
| %assign push_num push_num + 1 |
| %define i_xcnt_load ptrword [r7] |
| ; use end-of-line pointers so as to enable use of a negative counter as index. |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_sad8x8, r_tmp |
| sub p_sum16x16, i_xcnt |
| sub p_sqsum16x16, i_xcnt |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_upper8_remaining16 |
| .width_loop_upper8: |
| AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_upper8 |
| jg .width_loop_upper8_end |
| .width_loop_upper8_remaining16: |
| AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0 |
| .width_loop_upper8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| mov i_xcnt, i_xcnt_load |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_lower8_remaining16 |
| .width_loop_lower8: |
| AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_lower8 |
| jg .width_loop_lower8_end |
| .width_loop_lower8_remaining16: |
| AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1 |
| .width_loop_lower8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| %undef i_xcnt_load |
| pop i_xcnt |
| %assign push_num push_num - 1 |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| sub i_ycnt, 1 |
| jnz .height_loop |
| |
| .done: |
| mov r_tmp, p_sadframe |
| vextracti128 xmm2, y %+ mm_sadframe, 1 |
| vpaddd xmm2, x %+ mm_sadframe, xmm2 |
| vpunpckhqdq xmm1, xmm2, xmm2 |
| vpaddd xmm2, xmm2, xmm1 |
| vmovd [r_tmp], xmm2 |
| vzeroupper |
| |
| POPM saveregs |
| POP_XMM |
| LOAD_5_PARA_POP |
| %undef p_cur |
| %undef p_ref |
| %undef i_xcnt |
| %undef i_ycnt |
| %undef i_stride |
| %undef i_stride3 |
| %undef r_tmp |
| %undef xcnt_unit |
| %undef mm_sadframe |
| %undef mm_zero |
| %undef saveregs |
| %undef p_sadframe |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| ret |
| |
| |
| ; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9 |
| %macro AVX2_CalcSadSsd_8Lines 9 |
| %define mm_tmp0 %2 |
| %define mm_tmp1 %3 |
| %define mm_tmp2 %4 |
| %define mm_sad %5 |
| %define mm_sum %6 |
| %define mm_sqsum %7 |
| %define mm_sqdiff %8 |
| %define b_second_blocks %9 |
| ; Unroll for better performance on Haswell. |
| ; Avoid unrolling for the 16 px case so as to reduce the code footprint. |
| %ifidni %1, y |
| %ifdef i_stride5 |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| %define i_stride5_ i_stride5 |
| %else |
| lea r_tmp, [5 * i_stride] |
| %define i_stride5_ r_tmp |
| %endif |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| %ifndef i_stride5 |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| %endif |
| %undef i_stride5_ |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| add p_cur, %1 %+ mm_width |
| add p_ref, %1 %+ mm_width |
| %else |
| vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad |
| vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum |
| vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum |
| vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff |
| lea r_tmp, [8 * i_stride] |
| add p_cur, r_tmp |
| add p_ref, r_tmp |
| neg r_tmp |
| %%loop: |
| AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| add r_tmp, i_stride |
| jl %%loop |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| lea r_tmp, [8 * i_stride - %1 %+ mm_width] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| %endif |
| mov r_tmp, p_sad8x8 |
| AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks |
| %ifdef X86_32 |
| vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc |
| vmovdqa sadframe_acc, y %+ mm_tmp1 |
| %else |
| vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad |
| %endif |
| mov r_tmp, i_xcnt |
| add r_tmp, p_sum16x16 |
| vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1 |
| AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff |
| vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| %undef mm_tmp0 |
| %undef mm_tmp1 |
| %undef mm_tmp2 |
| %undef mm_sad |
| %undef mm_sum |
| %undef mm_sqsum |
| %undef mm_sqdiff |
| %undef b_second_blocks |
| %endmacro |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) |
| ;************************************************************************************************************* |
| |
| WELS_EXTERN VAACalcSadSsd_avx2 |
| %define p_sadframe ptrword arg6 |
| %define p_sad8x8 ptrword arg7 |
| %define p_sum16x16 ptrword arg8 |
| %define p_sqsum16x16 ptrword arg9 |
| %define p_sqdiff16x16 ptrword arg10 |
| %ifdef X86_32 |
| %define saveregs r5, r6 |
| %else |
| %define saveregs rbx, rbp, r12, r13, r14, r15 |
| %endif |
| |
| %assign push_num 0 |
| LOAD_5_PARA |
| PUSH_XMM 9 |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| PUSHM saveregs |
| |
| %define mm_zero mm0 |
| vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero |
| |
| %ifdef X86_32 |
| STACK_ALLOC r5, ymm_width, ymm_width |
| %define sadframe_acc_addr r5 |
| %define sadframe_acc [sadframe_acc_addr] |
| %else |
| %define sadframe_acc ymm8 |
| %define xsadframe_acc xmm8 |
| %endif |
| vmovdqa sadframe_acc, y %+ mm_zero |
| |
| and r2, -16 ; iPicWidth &= -16 |
| jle .done ; bail if iPicWidth < 16 |
| sar r3, 4 ; iPicHeight / 16 |
| jle .done ; bail if iPicHeight < 16 |
| shr r2, 2 ; iPicWidth / 4 |
| |
| %define p_cur r0 |
| %define p_ref r1 |
| %define i_xcnt r2 |
| %define i_ycnt ptrword arg4 |
| %define i_stride r4 |
| %define r_tmp r6 |
| %define xcnt_unit 4 |
| %ifdef X86_32 |
| mov i_ycnt, r3 |
| %define i_stride3 r3 |
| %else |
| mov r12, p_sad8x8 |
| mov r13, p_sum16x16 |
| mov r14, p_sqsum16x16 |
| mov r15, p_sqdiff16x16 |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| %undef p_sqdiff16x16 |
| %define p_sad8x8 r12 |
| %define p_sum16x16 r13 |
| %define p_sqsum16x16 r14 |
| %define p_sqdiff16x16 r15 |
| %define i_stride3 rbx |
| %define i_stride5 rbp |
| lea i_stride5, [5 * i_stride] |
| %endif |
| lea i_stride3, [3 * i_stride] |
| |
| ; offset pointers so as to compensate for the i_xcnt offset below. |
| sub p_sad8x8, 4 * 16 / xcnt_unit |
| sub p_sum16x16, 1 * 16 / xcnt_unit |
| sub p_sqsum16x16, 1 * 16 / xcnt_unit |
| sub p_sqdiff16x16, 1 * 16 / xcnt_unit |
| |
| ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. |
| neg i_xcnt |
| |
| .height_loop: |
| push i_xcnt |
| %assign push_num push_num + 1 |
| %define i_xcnt_load ptrword [r7] |
| ; use end-of-line pointers so as to enable use of a negative counter as index. |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_sad8x8, r_tmp |
| sub p_sum16x16, i_xcnt |
| sub p_sqsum16x16, i_xcnt |
| sub p_sqdiff16x16, i_xcnt |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_upper8_remaining16 |
| .width_loop_upper8: |
| AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_upper8 |
| jg .width_loop_upper8_end |
| .width_loop_upper8_remaining16: |
| AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 |
| .width_loop_upper8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| mov i_xcnt, i_xcnt_load |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| add i_xcnt, 16 / xcnt_unit |
| jz .width_loop_lower8_remaining16 |
| .width_loop_lower8: |
| AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_lower8 |
| jg .width_loop_lower8_end |
| .width_loop_lower8_remaining16: |
| AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 |
| .width_loop_lower8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| %undef i_xcnt_load |
| pop i_xcnt |
| %assign push_num push_num - 1 |
| lea p_cur, [p_cur + xcnt_unit * i_xcnt] |
| lea p_ref, [p_ref + xcnt_unit * i_xcnt] |
| sub i_ycnt, 1 |
| jnz .height_loop |
| |
| .done: |
| mov r_tmp, p_sadframe |
| %ifdef X86_32 |
| vmovdqa xmm2, sadframe_acc |
| vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] |
| %else |
| vextracti128 xmm2, sadframe_acc, 1 |
| vpaddd xmm2, xsadframe_acc, xmm2 |
| %endif |
| vpunpckhqdq xmm1, xmm2, xmm2 |
| vpaddd xmm2, xmm2, xmm1 |
| vmovd [r_tmp], xmm2 |
| vzeroupper |
| %ifdef X86_32 |
| STACK_DEALLOC |
| %endif |
| POPM saveregs |
| POP_XMM |
| LOAD_5_PARA_POP |
| %undef p_cur |
| %undef p_ref |
| %undef i_xcnt |
| %undef i_ycnt |
| %undef i_stride |
| %undef i_stride3 |
| %undef i_stride5 |
| %undef r_tmp |
| %undef xcnt_unit |
| %undef sadframe_acc |
| %undef sadframe_acc_addr |
| %undef xsadframe_acc |
| %undef mm_zero |
| %undef saveregs |
| %undef p_sadframe |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| %undef p_sqdiff16x16 |
| ret |
| |
| |
| ; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9 |
| %macro AVX2_CalcSadBgd_8Lines 9 |
| %define mm_tmp0 %2 |
| %define mm_tmp1 %3 |
| %define mm_tmp2 %8 |
| %define mm_mad %4 |
| %define mm_sumcur %5 |
| %define mm_sumref %6 |
| %define mm_sad %7 |
| %define b_second_blocks %9 |
| ; Unroll for better performance on Haswell. |
| ; Avoid unrolling for the 16 px case so as to reduce the code footprint. |
| %ifidni %1, y |
| lea r_tmp, [5 * i_stride] |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| add p_cur, %1 %+ mm_width |
| add p_ref, %1 %+ mm_width |
| %else |
| vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad |
| vpxor x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur |
| vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref |
| vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad |
| lea r_tmp, [8 * i_stride] |
| add p_cur, r_tmp |
| add p_ref, r_tmp |
| neg r_tmp |
| %%loop: |
| AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 |
| add r_tmp, i_stride |
| jl %%loop |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| lea r_tmp, [8 * i_stride - %1 %+ mm_width] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| %endif |
| mov r_tmp, p_sad8x8 |
| AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks |
| %ifdef X86_32 |
| vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc |
| vmovdqa sadframe_acc, y %+ mm_tmp1 |
| %else |
| vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad |
| %endif |
| mov r_tmp, p_sd8x8 |
| vpsubd %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref |
| AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| ; Coalesce store and horizontal reduction of MAD accumulator for even and |
| ; odd iterations so as to enable more parallelism. |
| %ifidni %1, y |
| test i_xcnt, 32 / xcnt_unit |
| jz %%preserve_mad |
| mov r_tmp, p_mad8x8 |
| AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0 |
| AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks |
| %%preserve_mad: |
| vmovdqa prev_mad, y %+ mm_mad |
| %else |
| mov r_tmp, p_mad8x8 |
| AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0 |
| AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks |
| %endif |
| %undef mm_tmp0 |
| %undef mm_tmp1 |
| %undef mm_tmp2 |
| %undef mm_mad |
| %undef mm_sumcur |
| %undef mm_sumref |
| %undef mm_sad |
| %undef b_second_blocks |
| %endmacro |
| |
| ; Store remaining MAD accumulator for width & 32 cases. |
| ; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4 |
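| ; The 32-px loop coalesces MAD stores over iteration pairs; when the row's |
| ; 32-px iteration count is odd, the last accumulator is still pending in |
| ; prev_mad and is reduced and stored here. |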
| %macro AVX2_StoreRemainingSingleMad 4 |
| test %1, 32 / xcnt_unit |
| jz %%skip |
| mov r_tmp, p_mad8x8 |
| vmovdqa y%2, prev_mad |
| AVX2_Maxubq y%2, y%2, y%3 |
| AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4 |
| %%skip: |
| %endmacro |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| WELS_EXTERN VAACalcSadBgd_avx2 |
| %define p_sadframe arg6 |
| %define p_sad8x8 arg7 |
| %define p_sd8x8 arg8 |
| %define p_mad8x8 arg9 |
| %ifdef X86_32 |
| %define saveregs r5, r6 |
| %else |
| %define saveregs rbx, rbp, r12, r13 |
| %endif |
| |
| %assign push_num 0 |
| LOAD_5_PARA |
| PUSH_XMM 10 |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| PUSHM saveregs |
| |
| %define mm_zero mm0 |
| vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero |
| |
| %ifdef X86_32 |
| STACK_ALLOC r5, 2 * ymm_width, ymm_width |
| %define sadframe_acc_addr r5 |
| %define sadframe_acc [sadframe_acc_addr] |
| %define prev_mad [r5 + ymm_width] |
| %else |
| %define sadframe_acc ymm8 |
| %define xsadframe_acc xmm8 |
| %define prev_mad ymm9 |
| %endif |
| vmovdqa sadframe_acc, y %+ mm_zero |
| |
| and r2, -16 ; iPicWidth &= -16 |
| jle .done ; bail if iPicWidth < 16 |
| sar r3, 4 ; iPicHeight / 16 |
| jle .done ; bail if iPicHeight < 16 |
| shr r2, 2 ; iPicWidth / 4 |
| |
| %define p_cur r0 |
| %define p_ref r1 |
| %define i_xcnt r2 |
| %define i_ycnt ptrword arg4 |
| %define i_stride r4 |
| %define r_tmp r6 |
| %define xcnt_unit 4 |
| %ifdef X86_32 |
| mov i_ycnt, r3 |
| %define i_stride3 r3 |
| %else |
| mov rbp, p_sad8x8 |
| mov r12, p_sd8x8 |
| mov r13, p_mad8x8 |
| %undef p_sad8x8 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| %define p_sad8x8 rbp |
| %define p_sd8x8 r12 |
| %define p_mad8x8 r13 |
| %define i_stride3 rbx |
| %endif |
| lea i_stride3, [3 * i_stride] |
| |
| ; offset pointers to compensate for the i_xcnt offset below. |
| mov r_tmp, i_xcnt |
| and r_tmp, 64 / xcnt_unit - 1 |
| sub p_mad8x8, r_tmp |
| shl r_tmp, 2 |
| sub p_sad8x8, r_tmp |
| sub p_sd8x8, r_tmp |
| |
| .height_loop: |
| push i_xcnt |
| %assign push_num push_num + 1 |
| %define i_xcnt_load ptrword [r7] |
| ; use end-of-line pointers so as to enable use of a negative counter as index. |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| add p_sad8x8, r_tmp |
| add p_sd8x8, r_tmp |
| add p_mad8x8, i_xcnt |
| and i_xcnt, -(64 / xcnt_unit) |
| jz .width_loop_upper8_64x_end |
| ; use a negative loop counter to enable counting toward zero and indexing with the same counter. |
| neg i_xcnt |
| .width_loop_upper8: |
| AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_upper8 |
| jg .width_loop_upper8_32x_end |
| .width_loop_upper8_64x_end: |
| test i_xcnt_load, 32 / xcnt_unit |
| jnz .width_loop_upper8 |
| .width_loop_upper8_32x_end: |
| AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0 |
| test i_xcnt_load, 16 / xcnt_unit |
| jz .width_loop_upper8_end |
| ; remaining 16. |
| AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 |
| .width_loop_upper8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| mov i_xcnt, i_xcnt_load |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| and i_xcnt, -(64 / xcnt_unit) |
| jz .width_loop_lower8_64x_end |
| neg i_xcnt |
| .width_loop_lower8: |
| AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_lower8 |
| jg .width_loop_lower8_32x_end |
| .width_loop_lower8_64x_end: |
| test i_xcnt_load, 32 / xcnt_unit |
| jnz .width_loop_lower8 |
| .width_loop_lower8_32x_end: |
| AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1 |
| test i_xcnt_load, 16 / xcnt_unit |
| jz .width_loop_lower8_end |
| ; remaining 16. |
| AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 |
| .width_loop_lower8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| pop i_xcnt |
| %undef i_xcnt_load |
| %assign push_num push_num - 1 |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| sub i_ycnt, 1 |
| jnz .height_loop |
| |
| .done: |
| mov r_tmp, p_sadframe |
| %ifdef X86_32 |
| vmovdqa xmm2, sadframe_acc |
| vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] |
| %else |
| vextracti128 xmm2, sadframe_acc, 1 |
| vpaddd xmm2, xsadframe_acc, xmm2 |
| %endif |
| vpunpckhqdq xmm1, xmm2, xmm2 |
| vpaddd xmm2, xmm2, xmm1 |
| vmovd [r_tmp], xmm2 |
| vzeroupper |
| %ifdef X86_32 |
| STACK_DEALLOC |
| %endif |
| POPM saveregs |
| POP_XMM |
| LOAD_5_PARA_POP |
| %undef p_cur |
| %undef p_ref |
| %undef i_xcnt |
| %undef i_ycnt |
| %undef i_stride |
| %undef i_stride3 |
| %undef r_tmp |
| %undef xcnt_unit |
| %undef sadframe_acc |
| %undef sadframe_acc_addr |
| %undef xsadframe_acc |
| %undef prev_mad |
| %undef mm_zero |
| %undef saveregs |
| %undef p_sadframe |
| %undef p_sad8x8 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| ret |
| |
| |
| ; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11 |
| %macro AVX2_CalcSadSsdBgd_8Lines 11 |
| %define mm_tmp0 %2 |
| %define mm_tmp1 %3 |
| %define mm_sad %4 |
| %define mm_sum %5 |
| %define mm_sumref %6 |
| %define mm_mad %7 |
| %define mm_sqsum %8 |
| %define mm_sqdiff %9 |
| %ifidn %10, 0 |
| %define tmp2 0 |
| %else |
| %define tmp2 %1 %+ %10 |
| %endif |
| %define b_second_blocks %11 |
| ; Unroll for better performance on Haswell. |
| ; Avoid unrolling for the 16 px case so as to reduce the code footprint. |
| %ifidni %1, y |
| lea r_tmp, [5 * i_stride] |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| lea r_tmp, [i_stride + 2 * i_stride3] |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| add p_cur, %1 %+ mm_width |
| add p_ref, %1 %+ mm_width |
| %else |
| vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad |
| vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum |
| vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref |
| vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad |
| vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum |
| vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff |
| lea r_tmp, [8 * i_stride] |
| add p_cur, r_tmp |
| add p_ref, r_tmp |
| neg r_tmp |
| %%loop: |
| AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 |
| add r_tmp, i_stride |
| jl %%loop |
| ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. |
| lea r_tmp, [8 * i_stride - %1 %+ mm_width] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| %endif |
| mov r_tmp, p_sad8x8 |
| AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks |
| %ifdef X86_32 |
| vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc |
| vmovdqa sadframe_acc, y %+ mm_tmp1 |
| %else |
| vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad |
| %endif |
| mov r_tmp, i_xcnt |
| add r_tmp, p_sum16x16 |
| vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1 |
| AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| mov r_tmp, p_sd8x8 |
| vpsubd %1 %+ mm_sum, %1 %+ mm_sum, %1 %+ mm_sumref |
| AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks |
| ; Coalesce store and horizontal reduction of MAD accumulator for even and |
| ; odd iterations so as to enable more parallelism. |
| %ifidni %1, y |
| test i_xcnt, 32 / xcnt_unit |
| jz %%preserve_mad |
| mov r_tmp, p_mad8x8 |
| AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0 |
| AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks |
| %%preserve_mad: |
| vmovdqa prev_mad, y %+ mm_mad |
| %else |
| mov r_tmp, p_mad8x8 |
| AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0 |
| AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks |
| %endif |
| vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff |
| vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b |
| vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 |
| AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks |
| %undef mm_tmp0 |
| %undef mm_tmp1 |
| %undef mm_sqsum |
| %undef mm_sqdiff |
| %undef mm_mad |
| %undef mm_sum |
| %undef mm_sumref |
| %undef mm_sad |
| %undef tmp2 |
| %undef b_second_blocks |
| %endmacro |
| |
| ;************************************************************************************************************* |
| ;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, |
| ; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, |
| ; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) |
| ;************************************************************************************************************* |
| |
| WELS_EXTERN VAACalcSadSsdBgd_avx2 |
| %define p_sadframe arg6 |
| %define p_sad8x8 arg7 |
| %define p_sum16x16 arg8 |
| %define p_sqsum16x16 arg9 |
| %define p_sqdiff16x16 arg10 |
| %define p_sd8x8 arg11 |
| %define p_mad8x8 arg12 |
| %ifdef X86_32 |
| %define saveregs r5, r6 |
| %else |
| %define saveregs rbx, rbp, r12, r13, r14, r15 |
| %endif |
| |
| %assign push_num 0 |
| LOAD_5_PARA |
| PUSH_XMM 12 |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| PUSHM saveregs |
| |
| %ifdef X86_32 |
| STACK_ALLOC r5, 3 * ymm_width, ymm_width |
| %define mm8 0 |
| %define sadframe_acc_addr r5 |
| %define sadframe_acc [sadframe_acc_addr] |
| %define prev_mad [r5 + ymm_width] |
| %define ymm_zero [r5 + 2 * ymm_width] |
| %define xmm_zero ymm_zero |
| vpxor xmm0, xmm0, xmm0 |
| vmovdqa sadframe_acc, ymm0 |
| vmovdqa ymm_zero, ymm0 |
| %else |
| %define sadframe_acc ymm9 |
| %define xsadframe_acc xmm9 |
| %define prev_mad ymm10 |
| %define ymm_zero ymm11 |
| %define xmm_zero xmm11 |
| vpxor xmm_zero, xmm_zero, xmm_zero |
| vpxor xsadframe_acc, xsadframe_acc, xsadframe_acc |
| %endif |
| |
| and r2, -16 ; iPicWidth &= -16 |
| jle .done ; bail if iPicWidth < 16 |
| sar r3, 4 ; iPicHeight / 16 |
| jle .done ; bail if iPicHeight < 16 |
| shr r2, 2 ; iPicWidth / 4 |
| |
| %define p_cur r0 |
| %define p_ref r1 |
| %define i_xcnt r2 |
| %define i_ycnt ptrword arg4 |
| %define i_stride r4 |
| %define r_tmp r6 |
| %define xcnt_unit 4 |
| %ifdef X86_32 |
| mov i_ycnt, r3 |
| %define i_stride3 r3 |
| %else |
| mov rbp, p_sad8x8 |
| mov r12, p_sum16x16 |
| mov r13, p_sqsum16x16 |
| mov r14, p_sqdiff16x16 |
| mov r15, p_sd8x8 |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| %undef p_sqdiff16x16 |
| %undef p_sd8x8 |
| %define p_sad8x8 rbp |
| %define p_sum16x16 r12 |
| %define p_sqsum16x16 r13 |
| %define p_sqdiff16x16 r14 |
| %define p_sd8x8 r15 |
| %define i_stride3 rbx |
| %endif |
| lea i_stride3, [3 * i_stride] |
| |
| ; offset pointers so as to compensate for the i_xcnt offset below. |
| mov r_tmp, i_xcnt |
| and r_tmp, 64 / xcnt_unit - 1 |
| sub p_sum16x16, r_tmp |
| sub p_sqsum16x16, r_tmp |
| sub p_sqdiff16x16, r_tmp |
| sub p_mad8x8, r_tmp |
| shl r_tmp, 2 |
| sub p_sad8x8, r_tmp |
| sub p_sd8x8, r_tmp |
| |
| .height_loop: |
| push i_xcnt |
| %assign push_num push_num + 1 |
| %define i_xcnt_load ptrword [r7] |
| ; use end-of-line pointers so as to enable use of a negative counter as index. |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| add p_sad8x8, r_tmp |
| add p_sum16x16, i_xcnt |
| add p_sqsum16x16, i_xcnt |
| add p_sqdiff16x16, i_xcnt |
| add p_sd8x8, r_tmp |
| add p_mad8x8, i_xcnt |
| and i_xcnt, -(64 / xcnt_unit) |
| jz .width_loop_upper8_64x_end |
| ; use a negative loop counter to enable counting toward zero and indexing with the same counter. |
| neg i_xcnt |
| .width_loop_upper8: |
| AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_upper8 |
| jg .width_loop_upper8_32x_end |
| .width_loop_upper8_64x_end: |
| test i_xcnt_load, 32 / xcnt_unit |
| jnz .width_loop_upper8 |
| .width_loop_upper8_32x_end: |
| AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0 |
| test i_xcnt_load, 16 / xcnt_unit |
| jz .width_loop_upper8_end |
| ; remaining 16. |
| AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0 |
| .width_loop_upper8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| mov i_xcnt, i_xcnt_load |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| and i_xcnt, -(64 / xcnt_unit) |
| jz .width_loop_lower8_64x_end |
| neg i_xcnt |
| .width_loop_lower8: |
| AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1 |
| add i_xcnt, 32 / xcnt_unit |
| jl .width_loop_lower8 |
| jg .width_loop_lower8_32x_end |
| .width_loop_lower8_64x_end: |
| test i_xcnt_load, 32 / xcnt_unit |
| jnz .width_loop_lower8 |
| .width_loop_lower8_32x_end: |
| AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1 |
| test i_xcnt_load, 16 / xcnt_unit |
| jz .width_loop_lower8_end |
| ; remaining 16. |
| AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1 |
| .width_loop_lower8_end: |
| lea p_cur, [p_cur + 8 * i_stride] |
| lea p_ref, [p_ref + 8 * i_stride] |
| pop i_xcnt |
| %undef i_xcnt_load |
| %assign push_num push_num - 1 |
| lea r_tmp, [xcnt_unit * i_xcnt] |
| sub p_cur, r_tmp |
| sub p_ref, r_tmp |
| sub i_ycnt, 1 |
| jnz .height_loop |
| |
| .done: |
| mov r_tmp, p_sadframe |
| %ifdef X86_32 |
| vmovdqa xmm2, sadframe_acc |
| vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] |
| %else |
| vextracti128 xmm2, sadframe_acc, 1 |
| vpaddd xmm2, xsadframe_acc, xmm2 |
| %endif |
| vpunpckhqdq xmm1, xmm2, xmm2 |
| vpaddd xmm2, xmm2, xmm1 |
| vmovd [r_tmp], xmm2 |
| vzeroupper |
| %ifdef X86_32 |
| STACK_DEALLOC |
| %endif |
| POPM saveregs |
| POP_XMM |
| LOAD_5_PARA_POP |
| %undef p_cur |
| %undef p_ref |
| %undef i_xcnt |
| %undef i_ycnt |
| %undef i_stride |
| %undef i_stride3 |
| %undef r_tmp |
| %undef xcnt_unit |
| %undef mm8 |
| %undef sadframe_acc |
| %undef sadframe_acc_addr |
| %undef xsadframe_acc |
| %undef prev_mad |
| %undef ymm_zero |
| %undef xmm_zero |
| %undef saveregs |
| %undef p_sadframe |
| %undef p_sad8x8 |
| %undef p_sum16x16 |
| %undef p_sqsum16x16 |
| %undef p_sqdiff16x16 |
| %undef p_sd8x8 |
| %undef p_mad8x8 |
| ret |
| |
| %endif |
| |