| ;*! |
| ;* \copy |
| ;* Copyright (c) 2009-2013, Cisco Systems |
| ;* All rights reserved. |
| ;* |
| ;* Redistribution and use in source and binary forms, with or without |
| ;* modification, are permitted provided that the following conditions |
| ;* are met: |
| ;* |
| ;* * Redistributions of source code must retain the above copyright |
| ;* notice, this list of conditions and the following disclaimer. |
| ;* |
| ;* * Redistributions in binary form must reproduce the above copyright |
| ;* notice, this list of conditions and the following disclaimer in |
| ;* the documentation and/or other materials provided with the |
| ;* distribution. |
| ;* |
| ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
| ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| ;* POSSIBILITY OF SUCH DAMAGE. |
| ;* |
| ;* |
;* downsampling.asm
| ;* |
| ;* Abstract |
;* SIMD for pixel-domain downsampling
| ;* |
| ;* History |
| ;* 10/22/2009 Created |
| ;* |
| ;*************************************************************************/ |
| %include "asm_inc.asm" |
| |
| %ifdef __NASM_VER__ |
| %use smartalign |
| %endif |
| |
| ;*********************************************************************** |
| ; Macros and other preprocessor constants |
| ;*********************************************************************** |
| |
| |
| ;*********************************************************************** |
| ; Some constants |
| ;*********************************************************************** |
| |
| ;*********************************************************************** |
| ; Local Data (Read Only) |
| ;*********************************************************************** |
| |
| %ifdef X86_32_PICASM |
| SECTION .text align=32 |
| %else |
| SECTION .rodata align=32 |
| %endif |
| |
| ;*********************************************************************** |
; Various memory constants (shuffle masks and rounding values)
| ;*********************************************************************** |
| |
| ALIGN 32 |
| %ifndef X86_32_PICASM |
| db80h_256: |
| times 32 db 80h |
| shufb_0000000088888888: |
| times 8 db 0 |
| times 8 db 8 |
| shufb_000044448888CCCC: |
| times 4 db 0 |
| times 4 db 4 |
| times 4 db 8 |
| times 4 db 12 |
| %endif |
| shufb_mask_low: |
| db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h |
| shufb_mask_high: |
| db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h |
| add_extra_half: |
| dd 16384,0,0,0 |
| |
| shufb_mask_quarter: |
| db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h |
| |
| shufb_mask_onethird_low_1: |
| db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h |
| shufb_mask_onethird_low_2: |
| db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h |
| shufb_mask_onethird_low_3: |
| db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh |
| |
| shufb_mask_onethird_high_1: |
| db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h |
| shufb_mask_onethird_high_2: |
| db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h |
| shufb_mask_onethird_high_3: |
| db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh |
| |
| ;*********************************************************************** |
| ; Code |
| ;*********************************************************************** |
| |
| SECTION .text |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $01 ; iSrcHeight >> 1 |
| |
| .yloops1: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| sar r4, $01 ; iSrcWidth >> 1 |
mov r6, r4 ; keep iDstWidth in r6
| sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb |
| neg r6 ; - (iSrcWidth >> 1) |
| ; each loop = source bandwidth: 32 bytes |
| .xloops1: |
; 1st part horizontal loop: x16 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E |
| ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M |
| ;=> target: |
| ;: H G F E D C B A, P O N M L K J I |
| ;: h g f e d c b a, p o n m l k j i |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movq mm0, [r2] ; 1st pSrc line |
| movq mm1, [r2+8] ; 1st pSrc line + 8 |
| movq mm2, [r2+r3] ; 2nd pSrc line |
| movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 |
| |
| ; to handle mm0, mm1, mm2, mm3 |
| pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B |
| pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B |
| punpcklbw mm4, mm5 ; d c D C b a B A |
| pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 |
| |
| pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B |
| pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B |
| punpcklbw mm5, mm6 ; h g H G f e F E |
| pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 |
| |
| pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B |
| pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B |
| punpcklbw mm6, mm7 ; l k L K j i J I |
| pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 |
| |
| pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B |
| pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B |
| punpcklbw mm7, mm0 ; p o P O n m N M |
| pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 |
| |
| ; to handle mm4, mm5, mm6, mm7 |
| movq mm0, mm4 ; |
| punpckldq mm0, mm5 ; H G F E D C B A |
| punpckhdq mm4, mm5 ; h g f e d c b a |
| |
| movq mm1, mm6 |
| punpckldq mm1, mm7 ; P O N M L K J I |
| punpckhdq mm6, mm7 ; p o n m l k j i |
| |
; average within MB horizontal width (16 x 2 lines)
pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; held until the 2nd horizontal part is done so memory is written once
| |
; 2nd part horizontal loop: x16 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E |
| ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M |
| ;=> target: |
| ;: H G F E D C B A, P O N M L K J I |
| ;: h g f e d c b a, p o n m l k j i |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movq mm1, [r2+16] ; 1st pSrc line + 16 |
| movq mm2, [r2+24] ; 1st pSrc line + 24 |
| movq mm3, [r2+r3+16] ; 2nd pSrc line + 16 |
| movq mm4, [r2+r3+24] ; 2nd pSrc line + 24 |
| |
| ; to handle mm1, mm2, mm3, mm4 |
| pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B |
| pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B |
| punpcklbw mm5, mm6 ; d c D C b a B A |
| pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 |
| |
| pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B |
| pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B |
| punpcklbw mm6, mm7 ; h g H G f e F E |
| pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 |
| |
| pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B |
| pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B |
| punpcklbw mm7, mm1 ; l k L K j i J I |
| pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 |
| |
| pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B |
| pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B |
| punpcklbw mm1, mm2 ; p o P O n m N M |
| pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 |
| |
| ; to handle mm5, mm6, mm7, mm1 |
| movq mm2, mm5 |
| punpckldq mm2, mm6 ; H G F E D C B A |
| punpckhdq mm5, mm6 ; h g f e d c b a |
| |
| movq mm3, mm7 |
| punpckldq mm3, mm1 ; P O N M L K J I |
| punpckhdq mm7, mm1 ; p o n m l k j i |
| |
; average within MB horizontal width (16 x 2 lines)
pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1; completes the 2nd horizontal part
| |
| movq [r0 ], mm0 |
| movq [r0+8], mm2 |
| |
| ; next SMB |
| lea r2, [r2+32] |
| lea r0, [r0+16] |
| |
| dec r4 |
| jg near .xloops1 |
| |
| ; next line |
lea r2, [r2+2*r3] ; advance pSrc by two source lines
| lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] |
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops1 |
| |
| WELSEMMS |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $01 ; iSrcHeight >> 1 |
| |
| .yloops2: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| sar r4, $01 ; iSrcWidth >> 1 |
mov r6, r4 ; keep iDstWidth in r6
| sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb |
| neg r6 ; - (iSrcWidth >> 1) |
| ; each loop = source bandwidth: 16 bytes |
| .xloops2: |
; 1st part horizontal loop: x16 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E |
| ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M |
| ;=> target: |
| ;: H G F E D C B A, P O N M L K J I |
| ;: h g f e d c b a, p o n m l k j i |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movq mm0, [r2] ; 1st pSrc line |
| movq mm1, [r2+8] ; 1st pSrc line + 8 |
| movq mm2, [r2+r3] ; 2nd pSrc line |
| movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 |
| |
| ; to handle mm0, mm1, mm2, mm3 |
| pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B |
| pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B |
| punpcklbw mm4, mm5 ; d c D C b a B A |
| pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 |
| |
| pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B |
| pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B |
| punpcklbw mm5, mm6 ; h g H G f e F E |
| pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 |
| |
| pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B |
| pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B |
| punpcklbw mm6, mm7 ; l k L K j i J I |
| pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 |
| |
| pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B |
| pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B |
| punpcklbw mm7, mm0 ; p o P O n m N M |
| pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 |
| |
| ; to handle mm4, mm5, mm6, mm7 |
| movq mm0, mm4 ; |
| punpckldq mm0, mm5 ; H G F E D C B A |
| punpckhdq mm4, mm5 ; h g f e d c b a |
| |
| movq mm1, mm6 |
| punpckldq mm1, mm7 ; P O N M L K J I |
| punpckhdq mm6, mm7 ; p o n m l k j i |
| |
; average within MB horizontal width (16 x 2 lines)
pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, written out below
| |
| movq [r0 ], mm0 |
| |
| ; next SMB |
| lea r2, [r2+16] |
| lea r0, [r0+8] |
| |
| dec r4 |
| jg near .xloops2 |
| |
| ; next line |
lea r2, [r2+2*r3] ; advance pSrc by two source lines
| lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] |
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops2 |
| |
| WELSEMMS |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $01 ; iSrcHeight >> 1 |
| |
| .yloops3: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| sar r4, $01 ; iSrcWidth >> 1 |
mov r6, r4 ; keep iDstWidth in r6
| sar r4, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb |
| neg r6 ; - (iSrcWidth >> 1) |
| ; each loop = source bandwidth: 8 bytes |
| .xloops3: |
; 1st part horizontal loop: x8 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: mm0: d D c C b B a A |
| ;2nd Line Src: mm1: h H g G f F e E |
| ;=> target: |
| ;: H G F E D C B A |
| ;: h g f e d c b a |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movq mm0, [r2] ; 1st pSrc line |
| movq mm1, [r2+r3] ; 2nd pSrc line |
| |
| ; to handle mm0, mm1, mm2, mm3 |
| pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B |
| pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B |
| punpcklbw mm2, mm3 ; d c D C b a B A |
| pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 |
| |
| pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B |
| pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B |
| punpcklbw mm4, mm5 ; h g H G f e F E |
| pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 |
| |
| ; to handle mm2, mm4 |
| movq mm0, mm2 ; |
| punpckldq mm0, mm4 ; H G F E D C B A |
| punpckhdq mm2, mm4 ; h g f e d c b a |
| |
; average within MB horizontal width (8 x 2 lines)
pavgb mm0, mm2 ; (A+a+1)>>1, .., (H+h+1)>>1: temp_row1 in the low dword, temp_row2 in the high dword
pshufw mm1, mm0, 04eh ; 01001110 B: swap dwords so the two temp rows line up
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1
| |
| movd [r0], mm0 |
| |
| ; next unit |
| lea r2, [r2+8] |
| lea r0, [r0+4] |
| |
| dec r4 |
| jg near .xloops3 |
| |
| ; next line |
lea r2, [r2+2*r3] ; advance pSrc by two source lines
| lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] |
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops3 |
| |
| WELSEMMS |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 4 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $01 ; iSrcHeight >> 1 |
| |
| WELS_DB1 xmm3 |
| WELS_Zero xmm2 |
| sar r4, $01 ; iSrcWidth >> 1 |
| add r0, r4 ; pDst += iSrcWidth >> 1 |
| |
| .yloops4: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| sar r4, $01 ; iSrcWidth >> 1 |
| neg r4 ; -(iSrcWidth >> 1) |
| mov r6, r4 |
| align 16 |
| ; each loop = source bandwidth: 32 bytes |
| .xloops4: |
| movdqa xmm0, [r2+r3] |
| movdqa xmm1, [r2+r3+16] |
| pavgb xmm0, [r2] ; avg vertical pixels 0-15 |
| pavgb xmm1, [r2+16] ; avg vertical pixels 16-31 |
| add r2, 32 ; pSrc += 32 |
| pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels 0-15 |
| pmaddubsw xmm1, xmm3 ; pairwise horizontal sum neighboring pixels 16-31 |
| pavgw xmm0, xmm2 ; (sum + 1) >> 1 |
| pavgw xmm1, xmm2 ; (sum + 1) >> 1 |
| packuswb xmm0, xmm1 ; pack words to bytes |
| movdqa [r0+r4], xmm0 ; store results |
| add r4, 16 |
| jl .xloops4 |
| |
| ; next line |
lea r2, [r2+2*r3] ; advance pSrc by two source lines
| lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] |
| lea r0, [r0+r1] |
| |
| sub r5, 1 |
| jg .yloops4 |
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 4 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $01 ; iSrcHeight >> 1 |
| WELS_DB1 xmm3 |
| WELS_Zero xmm2 |
| add r2, r4 ; pSrc += iSrcWidth |
| sar r4, $01 ; iSrcWidth >> 1 |
| add r0, r4 ; pDst += iSrcWidth >> 1 |
| |
| .yloops5: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| sar r4, $01 ; iSrcWidth >> 1 |
| neg r4 ; -(iSrcWidth >> 1) |
| lea r6, [r2+r3] ; pSrc + iSrcStride |
| align 16 |
| ; each loop = source bandwidth: 16 bytes |
| .xloops5: |
| movdqa xmm0, [r2+2*r4] |
| pavgb xmm0, [r6+2*r4] ; avg vertical pixels |
| pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels |
| pavgw xmm0, xmm2 ; (sum + 1) >> 1 |
| packuswb xmm0, xmm0 ; pack words to bytes |
| movlps [r0+r4], xmm0 ; store results |
| add r4, 8 |
| jl .xloops5 |
| |
| ; next line |
lea r2, [r2+2*r3] ; advance pSrc by two source lines
| lea r0, [r0+r1] |
| |
| sub r5, 1 |
| jg .yloops5 |
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| |
| %ifdef X86_32 |
| ;************************************************************************************************************** |
| ;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| %define pushsize 16 |
| %define localsize 16 |
| %define pDstData esp + pushsize + localsize + 4 |
| %define dwDstStride esp + pushsize + localsize + 8 |
| %define dwDstWidth esp + pushsize + localsize + 12 |
| %define dwDstHeight esp + pushsize + localsize + 16 |
| %define pSrcData esp + pushsize + localsize + 20 |
| %define dwSrcStride esp + pushsize + localsize + 24 |
| %define uiScaleX esp + pushsize + localsize + 28 |
| %define uiScaleY esp + pushsize + localsize + 32 |
| %define tmpHeight esp + 0 |
| %define yInverse esp + 4 |
| %define xInverse esp + 8 |
| %define dstStep esp + 12 |
| sub esp, localsize |
| |
| pxor xmm0, xmm0 |
| mov eax, [uiScaleX] |
| and eax, 32767 |
| mov ebx, eax |
| neg ebx |
| and ebx, 32767 |
movd xmm1, eax ; uinc = uiScaleX & 32767
| movd xmm2, ebx ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 0 uinc -uinc (dword) |
| pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov eax, [uiScaleY] |
| and eax, 32767 |
| mov ebx, eax |
| neg ebx |
| and ebx, 32767 |
movd xmm6, eax ; vinc = uiScaleY & 32767
| movd xmm2, ebx ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 0 vinc -vinc (dword) |
| pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov edx, 40003fffh |
| movd xmm5, edx |
| punpcklwd xmm5, xmm0 ; 16384 16383 |
| pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 |
| |
| |
| DOWNSAMPLE: |
| |
| mov eax, [dwDstHeight] |
| mov edi, [pDstData] |
| mov edx, [dwDstStride] |
| mov ecx, [dwDstWidth] |
| sub edx, ecx |
| mov [dstStep], edx ; stride - width |
| dec eax |
| mov [tmpHeight], eax |
| mov eax, 16384 |
| mov [yInverse], eax |
| |
| pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 |
| |
| HEIGHT: |
| mov eax, [yInverse] |
| mov esi, [pSrcData] |
| shr eax, 15 |
| mul dword [dwSrcStride] |
| add esi, eax ; get current row address |
| mov ebp, esi |
| add ebp, [dwSrcStride] |
| |
| mov eax, 16384 |
| mov [xInverse], eax |
| mov ecx, [dwDstWidth] |
| dec ecx |
| |
| movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 |
| |
| WIDTH: |
| mov eax, [xInverse] |
| shr eax, 15 |
| |
| movd xmm1, [esi+eax] ; xxxxxxba |
| movd xmm2, [ebp+eax] ; xxxxxxdc |
| pxor xmm0, xmm0 |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| punpcklwd xmm1, xmm0 ; 000d000c000b000a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| movdqa xmm0, xmm2 |
| pmuludq xmm2, xmm1 |
| psrlq xmm0, 32 |
| psrlq xmm1, 32 |
| pmuludq xmm0, xmm1 |
| paddq xmm2, xmm0 |
| pshufd xmm1, xmm2, 00001110b |
| paddq xmm2, xmm1 |
| psrlq xmm2, 29 |
| |
| movd eax, xmm2 |
| inc eax |
| shr eax, 1 |
| mov [edi], al |
| inc edi |
| |
| mov eax, [uiScaleX] |
| add [xInverse], eax |
| |
| paddw xmm3, xmm7 ; inc u |
| psllw xmm3, 1 |
| psrlw xmm3, 1 |
| |
| loop WIDTH |
| |
| WIDTH_END: |
| mov eax, [xInverse] |
| shr eax, 15 |
| mov cl, [esi+eax] |
| mov [edi], cl |
| inc edi |
| |
| mov eax, [uiScaleY] |
| add [yInverse], eax |
| add edi, [dstStep] |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec dword [tmpHeight] |
| jg HEIGHT |
| |
| |
| LAST_ROW: |
| mov eax, [yInverse] |
| mov esi, [pSrcData] |
| shr eax, 15 |
| mul dword [dwSrcStride] |
| add esi, eax ; get current row address |
| |
| mov eax, 16384 |
| mov [xInverse], eax |
| mov ecx, [dwDstWidth] |
| |
| LAST_ROW_WIDTH: |
| mov eax, [xInverse] |
| shr eax, 15 |
| |
| mov al, [esi+eax] |
| mov [edi], al |
| inc edi |
| |
| mov eax, [uiScaleX] |
| add [xInverse], eax |
| |
| loop LAST_ROW_WIDTH |
| |
| LAST_ROW_END: |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef pushsize |
| %undef localsize |
| %undef pSrcData |
| %undef dwSrcStride |
| %undef pDstData |
| %undef dwDstWidth |
| %undef dwDstHeight |
| %undef dwDstStride |
| %undef uiScaleX |
| %undef uiScaleY |
| %undef tmpHeight |
| %undef yInverse |
| %undef xInverse |
| %undef dstStep |
| ret |
| |
| |
| |
| |
| ;************************************************************************************************************** |
| ;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearFastDownsampler_sse2 |
| push ebp |
| push esi |
| push edi |
| push ebx |
| %define pushsize 16 |
| %define localsize 16 |
| %define pDstData esp + pushsize + localsize + 4 |
| %define dwDstStride esp + pushsize + localsize + 8 |
| %define dwDstWidth esp + pushsize + localsize + 12 |
| %define dwDstHeight esp + pushsize + localsize + 16 |
| %define pSrcData esp + pushsize + localsize + 20 |
| %define dwSrcStride esp + pushsize + localsize + 24 |
| %define uiScaleX esp + pushsize + localsize + 28 |
| %define uiScaleY esp + pushsize + localsize + 32 |
| %define tmpHeight esp + 0 |
| %define yInverse esp + 4 |
| %define xInverse esp + 8 |
| %define dstStep esp + 12 |
| sub esp, localsize |
| |
| pxor xmm0, xmm0 |
| mov edx, 65535 |
| mov eax, [uiScaleX] |
| and eax, edx |
| mov ebx, eax |
| neg ebx |
| and ebx, 65535 |
| movd xmm1, eax ; uinc(uiScaleX mod 65536) |
| movd xmm2, ebx ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 uinc 0 -uinc |
| pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov eax, [uiScaleY] |
| and eax, 32767 |
| mov ebx, eax |
| neg ebx |
| and ebx, 32767 |
movd xmm6, eax ; vinc = uiScaleY & 32767
| movd xmm2, ebx ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 vinc 0 -vinc |
| pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov edx, 80007fffh ; 32768 32767 |
| movd xmm5, edx |
| pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 |
| mov ebx, 16384 |
| |
| |
| FAST_DOWNSAMPLE: |
| |
| mov eax, [dwDstHeight] |
| mov edi, [pDstData] |
| mov edx, [dwDstStride] |
| mov ecx, [dwDstWidth] |
| sub edx, ecx |
| mov [dstStep], edx ; stride - width |
| dec eax |
| mov [tmpHeight], eax |
| mov eax, 16384 |
| mov [yInverse], eax |
| |
| pshuflw xmm4, xmm5, 01010000b |
| psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 |
| |
| FAST_HEIGHT: |
| mov eax, [yInverse] |
| mov esi, [pSrcData] |
| shr eax, 15 |
| mul dword [dwSrcStride] |
| add esi, eax ; get current row address |
| mov ebp, esi |
| add ebp, [dwSrcStride] |
| |
| mov eax, 32768 |
| mov [xInverse], eax |
| mov ecx, [dwDstWidth] |
| dec ecx |
| |
| movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 |
| |
| FAST_WIDTH: |
| mov eax, [xInverse] |
| shr eax, 16 |
| |
| movd xmm1, [esi+eax] ; xxxxxxba |
| movd xmm2, [ebp+eax] ; xxxxxxdc |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| pmaddwd xmm2, xmm1 |
| pshufd xmm1, xmm2, 00000001b |
| paddd xmm2, xmm1 |
| movd xmm1, ebx |
| paddd xmm2, xmm1 |
| psrld xmm2, 15 |
| |
| packuswb xmm2, xmm0 |
| movd eax, xmm2 |
| mov [edi], al |
| inc edi |
| |
| mov eax, [uiScaleX] |
| add [xInverse], eax |
| |
| paddw xmm3, xmm7 ; inc u |
| |
| loop FAST_WIDTH |
| |
| FAST_WIDTH_END: |
| mov eax, [xInverse] |
| shr eax, 16 |
| mov cl, [esi+eax] |
| mov [edi], cl |
| inc edi |
| |
| mov eax, [uiScaleY] |
| add [yInverse], eax |
| add edi, [dstStep] |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec dword [tmpHeight] |
| jg FAST_HEIGHT |
| |
| |
| FAST_LAST_ROW: |
| mov eax, [yInverse] |
| mov esi, [pSrcData] |
| shr eax, 15 |
| mul dword [dwSrcStride] |
| add esi, eax ; get current row address |
| |
| mov eax, 32768 |
| mov [xInverse], eax |
| mov ecx, [dwDstWidth] |
| |
| FAST_LAST_ROW_WIDTH: |
| mov eax, [xInverse] |
| shr eax, 16 |
| |
| mov al, [esi+eax] |
| mov [edi], al |
| inc edi |
| |
| mov eax, [uiScaleX] |
| add [xInverse], eax |
| |
| loop FAST_LAST_ROW_WIDTH |
| |
| FAST_LAST_ROW_END: |
| |
| add esp, localsize |
| pop ebx |
| pop edi |
| pop esi |
| pop ebp |
| %undef pushsize |
| %undef localsize |
| %undef pSrcData |
%undef dwDstWidth
%undef dwDstHeight
| %undef dwSrcStride |
| %undef pDstData |
| %undef dwDstStride |
| %undef uiScaleX |
| %undef uiScaleY |
| %undef tmpHeight |
| %undef yInverse |
| %undef xInverse |
| %undef dstStep |
| ret |
| |
| %elifdef WIN64 |
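; Win64 versions of the two general bilinear downsamplers: the same
; algorithms as the X86_32 code above, re-registered for the Win64 ABI.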
| |
| ;************************************************************************************************************** |
| ;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| push rsi |
| push rdi |
| push rbx |
| push rbp |
| %assign push_num 8 |
| LOAD_7_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| SIGN_EXTENSION r6, r6d |
| |
| pxor xmm0, xmm0 |
| mov r12d, r6d |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm1, r12d ; uinc = uiScaleX & 32767
| movd xmm2, r13d ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 0 uinc -uinc (dword) |
| pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov r12, arg8 |
| SIGN_EXTENSION r12, r12d |
| mov rbp, r12 |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm6, r12d ; vinc = uiScaleY & 32767
| movd xmm2, r13d ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 0 vinc -vinc (dword) |
| pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov r12d, 40003fffh |
| movd xmm5, r12d |
| punpcklwd xmm5, xmm0 ; 16384 16383 |
| pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 |
| |
| DOWNSAMPLE: |
| sub r1, r2 ; stride - width |
| dec r3 |
| mov r14,16384 |
| pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 |
| |
| HEIGHT: |
| mov r12, r14 |
| shr r12, 15 |
| imul r12, r5 |
| add r12, r4 ; get current row address |
| mov r13, r12 |
| add r13, r5 |
| |
| mov r15, 16384 |
| mov rsi, r2 |
| dec rsi |
| movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 |
| |
| WIDTH: |
| mov rdi, r15 |
| shr rdi, 15 |
| |
| movd xmm1, [r12+rdi] ; xxxxxxba |
| movd xmm2, [r13+rdi] ; xxxxxxdc |
| pxor xmm0, xmm0 |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| punpcklwd xmm1, xmm0 ; 000d000c000b000a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| movdqa xmm0, xmm2 |
| pmuludq xmm2, xmm1 |
| psrlq xmm0, 32 |
| psrlq xmm1, 32 |
| pmuludq xmm0, xmm1 |
| paddq xmm2, xmm0 |
| pshufd xmm1, xmm2, 00001110b |
| paddq xmm2, xmm1 |
| psrlq xmm2, 29 |
| |
| movd ebx, xmm2 |
| inc ebx |
| shr ebx, 1 |
| mov [r0], bl |
| inc r0 |
| |
| add r15, r6 |
| paddw xmm3, xmm7 ; inc u |
| psllw xmm3, 1 |
| psrlw xmm3, 1 |
| |
| dec rsi |
| jg WIDTH |
| |
| WIDTH_END: |
| shr r15, 15 |
| mov bl, [r12+r15] |
| mov [r0],bl |
| inc r0 |
| add r14, rbp |
| add r0, r1 |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec r3 |
| jg HEIGHT |
| |
| LAST_ROW: |
| shr r14, 15 |
| imul r14, r5 |
| add r4, r14 |
| mov r15, 16384 |
| |
| LAST_ROW_WIDTH: |
| mov rdi, r15 |
| shr rdi, 15 |
| mov bl, [r4+rdi] |
| mov [r0],bl |
| inc r0 |
| |
| add r15, r6 |
| dec r2 |
| jg LAST_ROW_WIDTH |
| |
| LAST_ROW_END: |
| |
| POP_XMM |
| pop rbp |
| pop rbx |
| pop rdi |
| pop rsi |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| ret |
| |
| ;************************************************************************************************************** |
| ;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearFastDownsampler_sse2 |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| push rsi |
| push rdi |
| push rbx |
| push rbp |
| %assign push_num 8 |
| LOAD_7_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| SIGN_EXTENSION r6, r6d |
| |
| pxor xmm0, xmm0 |
| mov r12d, r6d |
| and r12d, 65535 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 65535 |
| movd xmm1, r12d ; uinc(uiScaleX mod 65536) |
| movd xmm2, r13d ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 uinc 0 -uinc |
| pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov r12, arg8 |
| SIGN_EXTENSION r12, r12d |
| mov rbp, r12 |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm6, r12d ; vinc = uiScaleY & 32767
| movd xmm2, r13d ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 vinc 0 -vinc |
| pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov r12d, 80007fffh ; 32768 32767 |
| movd xmm5, r12d |
| pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 |
| |
| FAST_DOWNSAMPLE: |
| sub r1, r2 ; stride - width |
| dec r3 |
| mov r14,16384 |
| |
| pshuflw xmm4, xmm5, 01010000b |
| psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 |
| |
| FAST_HEIGHT: |
| mov r12, r14 |
| shr r12, 15 |
| imul r12, r5 |
| add r12, r4 ; get current row address |
| mov r13, r12 |
| add r13, r5 |
| |
| mov r15, 32768 |
| mov rsi, r2 |
| dec rsi |
| |
| movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 |
| |
| FAST_WIDTH: |
| mov rdi, r15 |
| shr rdi, 16 |
| |
| movd xmm1, [r12+rdi] ; xxxxxxba |
| movd xmm2, [r13+rdi] ; xxxxxxdc |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| pmaddwd xmm2, xmm1 |
| pshufd xmm1, xmm2, 00000001b |
| paddd xmm2, xmm1 |
| movdqa xmm1, [add_extra_half] |
| paddd xmm2, xmm1 |
| psrld xmm2, 15 |
| |
| packuswb xmm2, xmm0 |
| movd ebx, xmm2 |
| mov [r0], bl |
| inc r0 |
| |
| add r15, r6 |
| |
| paddw xmm3, xmm7 ; inc u |
| dec rsi |
| jg FAST_WIDTH |
| |
| FAST_WIDTH_END: |
| shr r15, 16 |
| mov bl, [r12+r15] |
| mov [r0],bl |
| inc r0 |
| add r14, rbp |
| add r0, r1 |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec r3 |
| jg FAST_HEIGHT |
| |
| |
| FAST_LAST_ROW: |
| shr r14, 15 |
| imul r14, r5 |
| add r4, r14 |
| mov r15, 32768 |
| |
| FAST_LAST_ROW_WIDTH: |
| mov rdi, r15 |
| shr rdi, 16 |
| mov bl, [r4+rdi] |
| mov [r0],bl |
| inc r0 |
| |
| add r15, r6 |
| dec r2 |
| jg FAST_LAST_ROW_WIDTH |
| |
| FAST_LAST_ROW_END: |
| |
| POP_XMM |
| pop rbp |
| pop rbx |
| pop rdi |
| pop rsi |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| ret |
| |
| %elifdef UNIX64 |
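; Unix64 versions of the two general bilinear downsamplers: the same
; algorithms again, using the System V register conventions.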
| |
| ;************************************************************************************************************** |
| ;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| push rbx |
| push rbp |
| %assign push_num 6 |
| LOAD_7_PARA |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| SIGN_EXTENSION r6, r6d |
| |
| pxor xmm0, xmm0 |
| mov r12d, r6d |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm1, r12d ; uinc = uiScaleX & 32767
| movd xmm2, r13d ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 0 uinc -uinc (dword) |
| pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov r12, arg8 |
| SIGN_EXTENSION r12, r12d |
| mov rbp, r12 |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm6, r12d ; vinc = uiScaleY & 32767
| movd xmm2, r13d ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 0 vinc -vinc (dword) |
| pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov r12d, 40003fffh |
| movd xmm5, r12d |
| punpcklwd xmm5, xmm0 ; 16384 16383 |
| pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 |
| |
| DOWNSAMPLE: |
| sub r1, r2 ; stride - width |
| dec r3 |
| mov r14,16384 |
| pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 |
| |
| HEIGHT: |
| mov r12, r14 |
| shr r12, 15 |
| imul r12, r5 |
| add r12, r4 ; get current row address |
| mov r13, r12 |
| add r13, r5 |
| |
| mov r15, 16384 |
| mov rax, r2 |
| dec rax |
| movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 |
| |
| WIDTH: |
| mov r11, r15 |
| shr r11, 15 |
| |
| movd xmm1, [r12+r11] ; xxxxxxba |
| movd xmm2, [r13+r11] ; xxxxxxdc |
| pxor xmm0, xmm0 |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| punpcklwd xmm1, xmm0 ; 000d000c000b000a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| movdqa xmm0, xmm2 |
| pmuludq xmm2, xmm1 |
| psrlq xmm0, 32 |
| psrlq xmm1, 32 |
| pmuludq xmm0, xmm1 |
| paddq xmm2, xmm0 |
| pshufd xmm1, xmm2, 00001110b |
| paddq xmm2, xmm1 |
| psrlq xmm2, 29 |
| |
| movd ebx, xmm2 |
| inc ebx |
| shr ebx, 1 |
| mov [r0], bl |
| inc r0 |
| |
| add r15, r6 |
| paddw xmm3, xmm7 ; inc u |
| psllw xmm3, 1 |
| psrlw xmm3, 1 |
| |
| dec rax |
| jg WIDTH |
| |
| WIDTH_END: |
| shr r15, 15 |
| mov bl, [r12+r15] |
| mov [r0],bl |
| inc r0 |
| add r14, rbp |
| add r0, r1 |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec r3 |
| jg HEIGHT |
| |
| LAST_ROW: |
| shr r14, 15 |
| imul r14, r5 |
| add r4, r14 |
| mov r15, 16384 |
| |
| LAST_ROW_WIDTH: |
| mov r11, r15 |
| shr r11, 15 |
| mov bl, [r4+r11] |
| mov [r0],bl |
| inc r0 |
| |
| add r15, r6 |
| dec r2 |
| jg LAST_ROW_WIDTH |
| |
| LAST_ROW_END: |
| |
| pop rbp |
| pop rbx |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| ret |
| |
| ;************************************************************************************************************** |
| ;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; unsigned int uiScaleX, unsigned int uiScaleY ); |
| ;{ |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearFastDownsampler_sse2 |
| push r12 |
| push r13 |
| push r14 |
| push r15 |
| push rbx |
| push rbp |
| %assign push_num 6 |
| LOAD_7_PARA |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| SIGN_EXTENSION r6, r6d |
| |
| pxor xmm0, xmm0 |
| mov r12d, r6d |
| and r12d, 65535 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 65535 |
| movd xmm1, r12d ; uinc(uiScaleX mod 65536) |
| movd xmm2, r13d ; -uinc |
| psllq xmm1, 32 |
| por xmm1, xmm2 ; 0 uinc 0 -uinc |
| pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc |
| |
| mov r12, arg8 |
| SIGN_EXTENSION r12, r12d |
| mov rbp, r12 |
| and r12d, 32767 |
| mov r13d, r12d |
| neg r13d |
| and r13d, 32767 |
movd xmm6, r12d ; vinc = uiScaleY & 32767
| movd xmm2, r13d ; -vinc |
| psllq xmm6, 32 |
| por xmm6, xmm2 ; 0 vinc 0 -vinc |
| pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc |
| |
| mov r12d, 80007fffh ; 32768 32767 |
| movd xmm5, r12d |
| pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 |
| |
| FAST_DOWNSAMPLE: |
| sub r1, r2 ; stride - width |
| dec r3 |
| mov r14,16384 |
| |
| pshuflw xmm4, xmm5, 01010000b |
| psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 |
| |
| FAST_HEIGHT: |
| mov r12, r14 |
| shr r12, 15 |
| imul r12, r5 |
| add r12, r4 ; get current row address |
| mov r13, r12 |
| add r13, r5 |
| |
| mov r15, 32768 |
| mov rax, r2 |
| dec rax |
| |
| movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 |
| |
| FAST_WIDTH: |
| mov r11, r15 |
| shr r11, 16 |
| |
| movd xmm1, [r12+r11] ; xxxxxxba |
| movd xmm2, [r13+r11] ; xxxxxxdc |
| punpcklwd xmm1, xmm2 ; xxxxdcba |
| punpcklbw xmm1, xmm0 ; 0d0c0b0a |
| |
| movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv |
| pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 |
| pmaddwd xmm2, xmm1 |
| pshufd xmm1, xmm2, 00000001b |
| paddd xmm2, xmm1 |
| movdqa xmm1, [add_extra_half] |
| paddd xmm2, xmm1 |
| psrld xmm2, 15 |
| |
| packuswb xmm2, xmm0 |
| movd ebx, xmm2 |
| mov [r0], bl |
| inc r0 |
| |
| add r15, r6 |
| |
| paddw xmm3, xmm7 ; inc u |
| dec rax |
| jg FAST_WIDTH |
| |
| FAST_WIDTH_END: |
| shr r15, 16 |
| mov bl, [r12+r15] |
| mov [r0],bl |
| inc r0 |
| add r14, rbp |
| add r0, r1 |
| |
| paddw xmm4, xmm6 ; inc v |
| psllw xmm4, 1 |
| psrlw xmm4, 1 |
| |
| dec r3 |
| jg FAST_HEIGHT |
| |
| |
| FAST_LAST_ROW: |
| shr r14, 15 |
| imul r14, r5 |
| add r4, r14 |
| mov r15, 32768 |
| |
| FAST_LAST_ROW_WIDTH: |
| mov r11, r15 |
| shr r11, 16 |
| mov bl, [r4+r11] |
| mov [r0],bl |
| inc r0 |
| |
| add r15, r6 |
| dec r2 |
| jg FAST_LAST_ROW_WIDTH |
| |
| FAST_LAST_ROW_END: |
| |
| pop rbp |
| pop rbx |
| pop r15 |
| pop r14 |
| pop r13 |
| pop r12 |
| ret |
| %endif |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearOneThirdDownsampler_ssse3( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| %ifdef X86_32_PICASM |
| %define i_height dword arg6 |
| %else |
| %define i_height r5 |
| %endif |
| INIT_X86_32_PIC_NOPRESERVE r5 |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| |
mov r6, r1 ;save the dst tail beyond the assigned area (the loops below overwrite it)
| imul r6, i_height |
| add r6, r0 |
| movdqa xmm7, [r6] |
| |
| .yloops_onethird_sse3: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| |
| mov r6, r0 ;save base address |
| ; each loop = source bandwidth: 48 bytes |
| .xloops_onethird_sse3: |
; 1st part horizontal loop: x48 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A |
| ; xmm2: k K * j J * i I * h H * g G * f |
| ; xmm2: * p P * o O * n N * m M * l L * |
| ; |
| ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A' |
| ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f' |
| ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' * |
| ;=> target: |
| ;: P O N M L K J I H G F E D C B A |
| ;: p o n m l k j i h g f e d c b a |
| ;: P' .. A' |
| ;: p' .. a' |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| ;1st line |
| movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A |
| movdqa xmm1, xmm0 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_1)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_1)] |
| pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0 |
| pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1 |
| |
| movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_2)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_2)] |
| pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2 |
| pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3 |
| |
| paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 |
| paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1 |
| |
| movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L * |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_3)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_3)] |
| pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2 |
| pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3 |
| |
| paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0 |
| paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1 |
| pavgb xmm0, xmm1 ;1st line average -> xmm0 |
| |
| ;2nd line |
| movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A' |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_1)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_1)] |
| pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2 |
| pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3 |
| |
| movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f' |
| movdqa xmm4, xmm1 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_2)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_2)] |
| pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1 |
| pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4 |
| |
| paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2 |
| paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3 |
| |
| movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' * |
| movdqa xmm4, xmm1 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_3)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_3)] |
| pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1 |
| pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4 |
| |
| paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2 |
| paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3 |
| pavgb xmm2, xmm3 ;2nd line average -> xmm2 |
| |
| pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line ) |
| |
| ; write pDst |
| movdqa [r0], xmm0 ;write result in dst |
| |
| ; next SMB |
| lea r2, [r2+48] ;current src address |
| lea r0, [r0+16] ;current dst address |
| |
| sub r4, 48 ;xloops counter |
| cmp r4, 0 |
| jg near .xloops_onethird_sse3 |
| |
| sub r6, r0 ;offset = base address - current address |
lea r2, [r2+2*r3] ;advance pSrc by three source lines...
lea r2, [r2+r3]
lea r2, [r2+2*r6] ;...then rewind by 3x the dst bytes written this row
lea r2, [r2+r6]
| lea r0, [r0+r1] |
lea r0, [r0+r6] ;rewind the bytes written: net advance is one dst line
| |
| dec i_height |
| jg near .yloops_onethird_sse3 |
| |
movdqa [r0], xmm7 ;restore the dst tail beyond the assigned area
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| DEINIT_X86_32_PIC |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| %undef i_height |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| %ifdef X86_32_PICASM |
| %define i_height dword arg6 |
| %else |
| %define i_height r5 |
| %endif |
| INIT_X86_32_PIC_NOPRESERVE r5 |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| |
mov r6, r1 ;save the dst tail beyond the assigned area (the loops below overwrite it)
| imul r6, i_height |
| add r6, r0 |
| movdqa xmm7, [r6] |
| |
| .yloops_onethird_sse4: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| |
| mov r6, r0 ;save base address |
| ; each loop = source bandwidth: 48 bytes |
| .xloops_onethird_sse4: |
; 1st part horizontal loop: x48 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A |
| ; xmm2: k K * j J * i I * h H * g G * f |
| ; xmm2: * p P * o O * n N * m M * l L * |
| ; |
| ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A' |
| ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f' |
| ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' * |
| ;=> target: |
| ;: P O N M L K J I H G F E D C B A |
| ;: p o n m l k j i h g f e d c b a |
| ;: P' .. A' |
| ;: p' .. a' |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| ;1st line |
| movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A |
| movdqa xmm1, xmm0 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_1)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_1)] |
| pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0 |
| pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1 |
| |
| movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_2)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_2)] |
| pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2 |
| pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3 |
| |
| paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0 |
| paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1 |
| |
| movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L * |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_3)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_3)] |
| pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2 |
| pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3 |
| |
| paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0 |
| paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1 |
| pavgb xmm0, xmm1 ;1st line average -> xmm0 |
| |
| ;2nd line |
| movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A' |
| movdqa xmm3, xmm2 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_1)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_1)] |
| pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2 |
| pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3 |
| |
| movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f' |
| movdqa xmm4, xmm1 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_2)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_2)] |
| pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1 |
| pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4 |
| |
| paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2 |
| paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3 |
| |
| movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' * |
| movdqa xmm4, xmm1 |
| movdqa xmm5, [pic(shufb_mask_onethird_low_3)] |
| movdqa xmm6, [pic(shufb_mask_onethird_high_3)] |
| pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1 |
| pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4 |
| |
| paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2 |
| paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3 |
| pavgb xmm2, xmm3 ;2nd line average -> xmm2 |
| |
| pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line ) |
| |
| ; write pDst |
| movdqa [r0], xmm0 ;write result in dst |
| |
| ; next SMB |
| lea r2, [r2+48] ;current src address |
| lea r0, [r0+16] ;current dst address |
| |
| sub r4, 48 ;xloops counter |
| cmp r4, 0 |
| jg near .xloops_onethird_sse4 |
| |
| sub r6, r0 ;offset = base address - current address |
lea r2, [r2+2*r3] ;advance pSrc by three source lines...
lea r2, [r2+r3]
lea r2, [r2+2*r6] ;...then rewind by 3x the dst bytes written this row
lea r2, [r2+r6]
| lea r0, [r0+r1] |
lea r0, [r0+r6] ;rewind the bytes written: net advance is one dst line
| |
| dec i_height |
| jg near .yloops_onethird_sse4 |
| |
movdqa [r0], xmm7 ;restore the dst tail beyond the assigned area
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| DEINIT_X86_32_PIC |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| %undef i_height |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearQuarterDownsampler_sse |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $02 ; iSrcHeight >> 2 |
| |
mov r6, r1 ;save the dst tail beyond the assigned area (the loops below overwrite it)
| imul r6, r5 |
| add r6, r0 |
| movq xmm7, [r6] |
| |
| .yloops_quarter_sse: |
| %ifdef X86_32 |
| mov r4, arg5 |
| %else |
| mov r4, r12 |
| %endif |
| |
| mov r6, r0 ;save base address |
| ; each loop = source bandwidth: 32 bytes |
| .xloops_quarter_sse: |
; 1st part horizontal loop: x16 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E |
| ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M |
| ; |
| ;=> target: |
| ;: G E C A, |
| ;: |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movq mm0, [r2] ; 1st pSrc line |
| movq mm1, [r2+8] ; 1st pSrc line + 8 |
| movq mm2, [r2+r3] ; 2nd pSrc line |
| movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 |
| |
| pshufw mm0, mm0, 0d8h ; x X x X c C a A |
| pshufw mm1, mm1, 0d8h ; x X x X g G e E |
| pshufw mm2, mm2, 0d8h ; x X x X k K i I |
| pshufw mm3, mm3, 0d8h ; x X x X o O m M |
| |
| punpckldq mm0, mm1 ; g G e E c C a A |
| punpckldq mm2, mm3 ; o O m M k K i I |
| |
| ; to handle mm0,mm2 |
| pshufw mm4, mm0, 0d8h ;g G c C e E a A |
| pshufw mm5, mm4, 04eh ;e E a A g G c C |
| punpcklbw mm4, mm5 ;g e G E c a C A -> mm4 |
| pshufw mm4, mm4, 0d8h ;g e c a G E C A -> mm4 |
| |
| pshufw mm5, mm2, 0d8h ;o O k K m M i I |
| pshufw mm6, mm5, 04eh ;m M i I o O k K |
| punpcklbw mm5, mm6 ;o m O M k i K I |
| pshufw mm5, mm5, 0d8h ;o m k i O M K I -> mm5 |
| |
| ; to handle mm4, mm5 |
| movq mm0, mm4 |
| punpckldq mm0, mm6 ;x x x x G E C A |
| punpckhdq mm4, mm6 ;x x x x g e c a |
| |
| movq mm1, mm5 |
| punpckldq mm1, mm6 ;x x x x O M K I |
| punpckhdq mm5, mm6 ;x x x x o m k i |
| |
; average within MB horizontal width (8 x 2 lines)
pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm1, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; held until the 2nd horizontal part is done so memory is written once
| |
; 2nd part horizontal loop: x16 bytes
| movq mm1, [r2+16] ; 1st pSrc line + 16 |
| movq mm2, [r2+24] ; 1st pSrc line + 24 |
| movq mm3, [r2+r3+16] ; 2nd pSrc line + 16 |
| movq mm4, [r2+r3+24] ; 2nd pSrc line + 24 |
| |
| pshufw mm1, mm1, 0d8h |
| pshufw mm2, mm2, 0d8h |
| pshufw mm3, mm3, 0d8h |
| pshufw mm4, mm4, 0d8h |
| |
| punpckldq mm1, mm2 |
| punpckldq mm3, mm4 |
| |
| ; to handle mm1, mm3 |
| pshufw mm4, mm1, 0d8h |
| pshufw mm5, mm4, 04eh |
| punpcklbw mm4, mm5 |
| pshufw mm4, mm4, 0d8h |
| |
| pshufw mm5, mm3, 0d8h |
| pshufw mm6, mm5, 04eh |
| punpcklbw mm5, mm6 |
| pshufw mm5, mm5, 0d8h |
| |
| ; to handle mm4, mm5 |
| movq mm2, mm4 |
| punpckldq mm2, mm6 |
| punpckhdq mm4, mm6 |
| |
| movq mm3, mm5 |
| punpckldq mm3, mm6 |
| punpckhdq mm5, mm6 |
| |
; average within MB horizontal width (8 x 2 lines)
pavgb mm2, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm3, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1; completes the 2nd horizontal part
| |
| movd [r0 ], mm0 |
| movd [r0+4], mm2 |
| |
| ; next SMB |
| lea r2, [r2+32] |
| lea r0, [r0+8] |
| |
| sub r4, 32 |
| cmp r4, 0 |
| jg near .xloops_quarter_sse |
| |
| sub r6, r0 |
| ; next line |
lea r2, [r2+4*r3] ; advance pSrc by four source lines
| lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth] |
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops_quarter_sse |
| |
movq [r0], xmm7 ;restore the dst tail beyond the assigned area
| |
| WELSEMMS |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $02 ; iSrcHeight >> 2 |
| |
mov r6, r1 ;Save the tail bytes for the unaligned size (restored after the loop)
| imul r6, r5 |
| add r6, r0 |
| movq xmm7, [r6] |
| |
| INIT_X86_32_PIC_NOPRESERVE r4 |
| movdqa xmm6, [pic(shufb_mask_quarter)] |
| DEINIT_X86_32_PIC |
| |
| .yloops_quarter_sse3: |
| %ifdef X86_32 |
mov r4, arg5 ; iSrcWidth
| %else |
| mov r4, r12 |
| %endif |
| |
| mov r6, r0 |
| ; each loop = source bandwidth: 32 bytes |
| .xloops_quarter_sse3: |
; horizontal loop: x32 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A |
| ; xmm1: p P o O n N m M l L k K j J i I |
| ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A |
| ; xmm3: p P o O n N m M l L k K j J i I |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movdqa xmm0, [r2] ; 1st_src_line |
| movdqa xmm1, [r2+16] ; 1st_src_line + 16 |
| movdqa xmm2, [r2+r3] ; 2nd_src_line |
| movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16 |
| |
| pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A |
| pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I |
| pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A |
| pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I |
| |
| movdqa xmm4, xmm0 |
| movdqa xmm5, xmm2 |
| punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0 |
| punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4 |
| punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2 |
| punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5 |
| |
| pavgb xmm0, xmm4 |
| pavgb xmm2, xmm5 |
| pavgb xmm0, xmm2 ;average |
| |
| ; write pDst |
| movq [r0], xmm0 |
| |
| ; next SMB |
| lea r2, [r2+32] |
| lea r0, [r0+8] |
| |
| sub r4, 32 |
| cmp r4, 0 |
| jg near .xloops_quarter_sse3 |
| |
| sub r6, r0 |
| ; next line |
lea r2, [r2+4*r3] ; advance src by 4 lines
| lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth] |
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops_quarter_sse3 |
| |
movq [r0], xmm7 ;restore the saved tail bytes for the unaligned size
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ;*********************************************************************** |
| ; void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride, |
| ; unsigned char* pSrc, const int iSrcStride, |
| ; const int iSrcWidth, const int iSrcHeight ); |
| ;*********************************************************************** |
| WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4 |
| %ifdef X86_32 |
| push r6 |
| %assign push_num 1 |
| %else |
| %assign push_num 0 |
| %endif |
| LOAD_6_PARA |
| PUSH_XMM 8 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r4, r4d |
| SIGN_EXTENSION r5, r5d |
| |
| %ifndef X86_32 |
| push r12 |
| mov r12, r4 |
| %endif |
| sar r5, $02 ; iSrcHeight >> 2 |
| |
mov r6, r1 ;Save the tail bytes for the unaligned size (restored after the loop)
| imul r6, r5 |
| add r6, r0 |
| movq xmm7, [r6] |
| |
| INIT_X86_32_PIC_NOPRESERVE r4 |
| movdqa xmm6, [pic(shufb_mask_quarter)] ;mask |
| DEINIT_X86_32_PIC |
| |
| .yloops_quarter_sse4: |
| %ifdef X86_32 |
mov r4, arg5 ; iSrcWidth
| %else |
| mov r4, r12 |
| %endif |
| |
| mov r6, r0 |
| ; each loop = source bandwidth: 32 bytes |
| .xloops_quarter_sse4: |
; horizontal loop: x32 bytes
| ; mem hi<- ->lo |
| ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A |
| ; xmm1: p P o O n N m M l L k K j J i I |
| ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A |
| ; xmm3: p P o O n N m M l L k K j J i I |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| movntdqa xmm0, [r2] ; 1st_src_line |
| movntdqa xmm1, [r2+16] ; 1st_src_line + 16 |
| movntdqa xmm2, [r2+r3] ; 2nd_src_line |
| movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16 |
| |
| pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A |
| pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I |
| pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A |
| pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I |
| |
| movdqa xmm4, xmm0 |
| movdqa xmm5, xmm2 |
| punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0 |
| punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4 |
| punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2 |
| punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5 |
| |
| pavgb xmm0, xmm4 |
| pavgb xmm2, xmm5 |
| pavgb xmm0, xmm2 ;average |
| |
| ; write pDst |
| movq [r0], xmm0 |
| |
| ; next SMB |
| lea r2, [r2+32] |
| lea r0, [r0+8] |
| |
| sub r4, 32 |
| cmp r4, 0 |
| jg near .xloops_quarter_sse4 |
| |
| sub r6, r0 |
lea r2, [r2+4*r3] ; advance src by 4 lines
lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth]
| lea r0, [r0+r1] |
| lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] |
| |
| dec r5 |
| jg near .yloops_quarter_sse4 |
| |
movq [r0], xmm7 ;restore the saved tail bytes for the unaligned size
| |
| %ifndef X86_32 |
| pop r12 |
| %endif |
| |
| POP_XMM |
| LOAD_6_PARA_POP |
| %ifdef X86_32 |
| pop r6 |
| %endif |
| ret |
| |
| ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5 |
| %macro SSE2_BilinearIncXposuw 5 |
| movdqa %5, %2 |
| paddw %2, %4 |
| paddusw %5, %4 |
| pcmpeqw %5, %2 |
| paddb %1, %3 |
| paddb %1, %5 ; subtract 1 if no carry |
| %endmacro |
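; Per 16-bit lane this is, in scalar terms (illustrative):
;   carry = ((frac + inc_frac) >> 16) & 1;
;   frac  = (frac + inc_frac) & 0xffff;
;   xint += inc_int + carry;            // %3 already holds inc_int + 1
; The saturating add plus compare detects the carry without widening lanes.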
| |
| ; outl=%1 outh=%2 in=%3 |
| %macro SSE2_UnpckXFracuw 3 |
| pcmpeqw %1, %1 |
| pxor %1, %3 |
| movdqa %2, %1 |
| punpcklwd %1, %3 |
| punpckhwd %2, %3 |
| %endmacro |
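; Builds pmaddwd operands from the Q16 x-fractions: each word pair becomes
; (0xffff - frac, frac), i.e. roughly ((1 - f), f) scaled by 2^16, split into
; a low half (%1) and a high half (%2).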
| |
| ; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4 |
| %macro SSE2_BilinearFastCalcXYFrac 4 |
| movdqa %2, %1 |
| pmulhuw %1, %3 |
| pmulhuw %2, %4 |
| %endmacro |
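; pmulhuw keeps the high 16 bits of the unsigned 16x16 product, i.e.
; (a * b) >> 16; applied to the ((1-fx), fx) pairs this yields the bilinear
; weight pairs for source row 0 (%1 = in * yfrac0) and row 1 (%2 = in * yfrac1).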
| |
| ; [in:dwordsl out:bytes] dwordsh=%2 zero=%3 |
| %macro SSE2_BilinearFastPackDwordsToBytes 3 |
| psrld %1, 14 |
| psrld %2, 14 |
| packssdw %1, %2 |
| pavgw %1, %3 |
| packuswb %1, %1 |
| %endmacro |
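; The dword accumulators hold pixel values scaled by roughly 2^15 here;
; psrld 14 followed by pavgw against zero ((x + 1) >> 1) is a rounded shift
; right by 15, and packuswb clamps the result to bytes.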
| |
| %macro SSSE3_BilinearFastDownsample2xOrLess_8px 0 |
| movdqa xmm_tmp0, xmm_xpos_int |
| pshufb xmm_tmp0, xmm_0 |
| psubb xmm_xpos_int, xmm_tmp0 |
| SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 8 * i_scalex] |
| shr r_tmp0, 16 |
| lddqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| pshufb xmm_tmp4, xmm_xpos_int |
| movdqa xmm_tmp5, xmm_tmp4 |
| punpcklbw xmm_tmp4, xmm_0 |
| punpckhbw xmm_tmp5, xmm_0 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1 |
| pmaddwd xmm_tmp0, xmm_tmp4 |
| pmaddwd xmm_tmp1, xmm_tmp5 |
| lddqu xmm_tmp4, [p_src_row1 + r_tmp0] |
| pshufb xmm_tmp4, xmm_xpos_int |
| movdqa xmm_tmp5, xmm_tmp4 |
| punpcklbw xmm_tmp4, xmm_0 |
| punpckhbw xmm_tmp5, xmm_0 |
| pmaddwd xmm_tmp2, xmm_tmp4 |
| pmaddwd xmm_tmp3, xmm_tmp5 |
| paddd xmm_tmp0, xmm_tmp2 |
| paddd xmm_tmp1, xmm_tmp3 |
| SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0 |
| %endmacro |
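; Per output pixel the fast kernels compute, as a scalar sketch (illustrative;
; fx = Q16 x-fraction, fy = Q15 y-fraction, p0/p1 = the two source rows,
; i = xpos >> 16):
;
;   w00 = ((0xffff - fx) * (0x7fff - fy)) >> 16;   // pmulhuw
;   w01 = (fx * (0x7fff - fy)) >> 16;
;   w10 = ((0xffff - fx) * fy) >> 16;
;   w11 = (fx * fy) >> 16;
;   acc = w00 * p0[i] + w01 * p0[i+1] + w10 * p1[i] + w11 * p1[i+1];
;   out = ((acc >> 14) + 1) >> 1;                  // rounded >> 15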
| |
| %macro SSSE3_BilinearFastDownsample4xOrLess_8px 0 |
| movdqa xmm_tmp0, xmm_xpos_int |
| pshufb xmm_tmp0, xmm_shufb_0000000088888888 |
| psubb xmm_xpos_int, xmm_tmp0 |
| SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| lddqu xmm_tmp3, [p_src_row0 + r_tmp0] |
| lddqu xmm_tmp4, [p_src_row1 + r_tmp0] |
| movdqa xmm_tmp2, xmm_xpos_int |
| punpcklbw xmm_tmp2, xmm_db80h |
| pshufb xmm_tmp3, xmm_tmp2 |
| pshufb xmm_tmp4, xmm_tmp2 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 |
| pmaddwd xmm_tmp0, xmm_tmp3 |
| pmaddwd xmm_tmp2, xmm_tmp4 |
| paddd xmm_tmp0, xmm_tmp2 |
| lea r_tmp0, [i_xpos + 4 * i_scalex] |
| lea i_xpos, [i_xpos + 8 * i_scalex] |
| shr r_tmp0, 16 |
| lddqu xmm_tmp3, [p_src_row0 + r_tmp0] |
| lddqu xmm_tmp4, [p_src_row1 + r_tmp0] |
| movdqa xmm_tmp2, xmm_xpos_int |
| punpckhbw xmm_tmp2, xmm_db80h |
| pshufb xmm_tmp3, xmm_tmp2 |
| pshufb xmm_tmp4, xmm_tmp2 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 |
| pmaddwd xmm_tmp1, xmm_tmp3 |
| pmaddwd xmm_tmp2, xmm_tmp4 |
| paddd xmm_tmp1, xmm_tmp2 |
| SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0 |
| %endmacro |
| |
| %macro SSE2_GeneralBilinearFastDownsample_8px 0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movd xmm_tmp3, [p_src_row0 + r_tmp0] |
| movd xmm_tmp4, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3 |
| punpcklbw xmm_tmp3, xmm_0 |
| punpcklbw xmm_tmp4, xmm_0 |
| movdqa xmm_tmp0, xmm_xfrac0 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 |
| pmaddwd xmm_tmp0, xmm_tmp3 |
| pmaddwd xmm_tmp2, xmm_tmp4 |
| paddd xmm_tmp0, xmm_tmp2 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movd xmm_tmp3, [p_src_row0 + r_tmp0] |
| movd xmm_tmp4, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3 |
| pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3 |
| punpcklbw xmm_tmp3, xmm_0 |
| punpcklbw xmm_tmp4, xmm_0 |
| movdqa xmm_tmp1, xmm_xfrac1 |
| SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 |
| pmaddwd xmm_tmp1, xmm_tmp3 |
| pmaddwd xmm_tmp2, xmm_tmp4 |
| paddd xmm_tmp1, xmm_tmp2 |
| SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| paddw xmm_xfrac0, xmm_xfrac_inc |
| paddw xmm_xfrac1, xmm_xfrac_inc |
| %endmacro |
| |
| ; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6 |
| %macro SSE2_BilinearIncXposw 6 |
| pxor %6, %6 |
| paddw %2, %4 |
| pcmpgtw %6, %2 |
| paddb %1, %3 |
| psubb %1, %6 ; add carry |
| pand %2, %5 |
| %endmacro |
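; 15-bit fraction variant: a set sign bit after paddw means the Q15 fraction
; wrapped; pcmpgtw against zero turns that into -1 per lane, psubb converts
; it to a +1 carry on the integer part, and pand masks frac back to 15 bits.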
| |
| ; outl=%1 outh=%2 in=%3 7FFFh=%4 |
| %macro SSE2_UnpckXFracw 4 |
| movdqa %1, %3 |
| pxor %1, %4 |
| movdqa %2, %1 |
| punpcklwd %1, %3 |
| punpckhwd %2, %3 |
| %endmacro |
| |
| ; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6 |
| %macro SSE41_LinearAccurateInterpolateVerticalDwords 6 |
| pshufd %1, %2, 10110001b |
| pshufd %6, %3, 10110001b |
| pmuludq %1, %4 |
| pmuludq %6, %5 |
| paddq %1, %6 |
| pmuludq %2, %4 |
| pmuludq %3, %5 |
| paddq %2, %3 |
| psllq %1, 3 |
| psrlq %2, 29 |
| blendps %1, %2, 0101b |
| %endmacro |
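; Computes (data0 * frac0 + data1 * frac1) >> 29 per dword lane with full
; 32x32->64-bit pmuludq products: the pshufd copies expose the odd lanes,
; psllq 3 lines their result up with the psrlq 29 of the even lanes, and
; blendps 0101b merges the two halves back into one register.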
| |
| %macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0 |
| movdqa xmm_tmp0, xmm_xpos_int |
| pshufb xmm_tmp0, xmm_0 |
| psubb xmm_xpos_int, xmm_tmp0 |
| SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 8 * i_scalex] |
| shr r_tmp0, 16 |
| lddqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| pshufb xmm_tmp4, xmm_xpos_int |
| movdqa xmm_tmp5, xmm_tmp4 |
| punpcklbw xmm_tmp4, xmm_0 |
| punpckhbw xmm_tmp5, xmm_0 |
| pmaddwd xmm_tmp4, xmm_tmp0 |
| pmaddwd xmm_tmp5, xmm_tmp1 |
| lddqu xmm_tmp2, [p_src_row1 + r_tmp0] |
| pshufb xmm_tmp2, xmm_xpos_int |
| movdqa xmm_tmp3, xmm_tmp2 |
| punpcklbw xmm_tmp2, xmm_0 |
| punpckhbw xmm_tmp3, xmm_0 |
| pmaddwd xmm_tmp2, xmm_tmp0 |
| pmaddwd xmm_tmp3, xmm_tmp1 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2 |
| packssdw xmm_tmp0, xmm_tmp1 |
| pavgw xmm_tmp0, xmm_0 |
| packuswb xmm_tmp0, xmm_tmp0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0 |
| %endmacro |
| |
| %macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0 |
| movdqa xmm_tmp0, xmm_xpos_int |
| pshufb xmm_tmp0, xmm_shufb_0000000088888888 |
| psubb xmm_xpos_int, xmm_tmp0 |
| SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movdqa xmm_tmp3, xmm_xpos_int |
| punpcklbw xmm_tmp3, xmm_db80h |
| lddqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| lddqu xmm_tmp2, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex] |
| lea i_xpos, [i_xpos + 8 * i_scalex] |
| shr r_tmp0, 16 |
| pshufb xmm_tmp4, xmm_tmp3 |
| pshufb xmm_tmp2, xmm_tmp3 |
| pmaddwd xmm_tmp4, xmm_tmp0 |
| pmaddwd xmm_tmp2, xmm_tmp0 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3 |
| movdqa xmm_tmp2, xmm_xpos_int |
| punpckhbw xmm_tmp2, xmm_db80h |
| lddqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| lddqu xmm_tmp3, [p_src_row1 + r_tmp0] |
| pshufb xmm_tmp4, xmm_tmp2 |
| pshufb xmm_tmp3, xmm_tmp2 |
| pmaddwd xmm_tmp4, xmm_tmp1 |
| pmaddwd xmm_tmp3, xmm_tmp1 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2 |
| packssdw xmm_tmp0, xmm_tmp1 |
| pavgw xmm_tmp0, xmm_0 |
| packuswb xmm_tmp0, xmm_tmp0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0 |
| %endmacro |
| |
| %macro SSE41_GeneralBilinearAccurateDownsample_8px 0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movd xmm_tmp4, [p_src_row0 + r_tmp0] |
| movd xmm_tmp2, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1 |
| pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2 |
| pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3 |
| pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 3 |
| punpcklbw xmm_tmp4, xmm_0 |
| punpcklbw xmm_tmp2, xmm_0 |
| pmaddwd xmm_tmp4, xmm_xfrac0 |
| pmaddwd xmm_tmp2, xmm_xfrac0 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movd xmm_tmp4, [p_src_row0 + r_tmp0] |
| movd xmm_tmp3, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1 |
| pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2 |
| pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3 |
| pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 3 |
| punpcklbw xmm_tmp4, xmm_0 |
| punpcklbw xmm_tmp3, xmm_0 |
| pmaddwd xmm_tmp4, xmm_xfrac1 |
| pmaddwd xmm_tmp3, xmm_xfrac1 |
| SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2 |
| packssdw xmm_tmp0, xmm_tmp1 |
| pavgw xmm_tmp0, xmm_0 |
| packuswb xmm_tmp0, xmm_tmp0 |
| movlps [p_dst], xmm_tmp0 |
| add p_dst, 8 |
| paddw xmm_xfrac0, xmm_xfrac_inc |
| paddw xmm_xfrac1, xmm_xfrac_inc |
| pand xmm_xfrac0, xmm_7fff |
| pand xmm_xfrac1, xmm_7fff |
| %endmacro |
| |
| ; downsample_8px_macro=%1 b_fast=%2 |
| %macro SSE2_GeneralBilinearDownsampler_loop 2 |
| %%height: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov p_src_row1, p_src_row0 |
| add p_src_row1, i_src_stride |
| movd xmm_tmp1, i_yposd |
| %if %2 |
| pshuflw xmm_tmp1, xmm_tmp1, 0 |
| psllw xmm_tmp1, 1 |
| psrlw xmm_tmp1, 1 |
| %else |
| pslld xmm_tmp1, 17 |
| psrld xmm_tmp1, 17 |
| %endif |
| %ifdef X86_32 |
| pshufd xmm_tmp1, xmm_tmp1, 0 |
| pcmpeqw xmm_tmp0, xmm_tmp0 |
| %if %2 |
| psrlw xmm_tmp0, 1 |
| %else |
| psrld xmm_tmp0, 17 |
| %endif |
| pxor xmm_tmp0, xmm_tmp1 |
| movdqa xmm_yfrac0, xmm_tmp0 |
| movdqa xmm_yfrac1, xmm_tmp1 |
| %else |
| pshufd xmm_yfrac1, xmm_tmp1, 0 |
| pcmpeqw xmm_yfrac0, xmm_yfrac0 |
| %if %2 |
| psrlw xmm_yfrac0, 1 |
| %else |
| psrld xmm_yfrac0, 17 |
| %endif |
| pxor xmm_yfrac0, xmm_yfrac1 |
| %endif |
| |
| mov i_xpos, 1 << 15 |
| mov i_width_cnt, i_dst_width |
| sub i_width_cnt, 1 |
| |
| %ifdef xmm_xpos_int |
| movdqa xmm_xpos_int, xmm_xpos_int_begin |
| movdqa xmm_xpos_frac, xmm_xpos_frac_begin |
| %else |
| movdqa xmm_xfrac0, xmm_xfrac0_begin |
| movdqa xmm_xfrac1, xmm_xfrac1_begin |
| %endif |
| |
| %%width: |
| %1 |
| sub i_width_cnt, 8 |
| jg %%width |
| |
| lea p_dst, [p_dst + i_width_cnt + 1] |
| imul i_width_cnt, i_scalex |
| add i_xpos, i_width_cnt |
| shr i_xpos, 16 |
| movzx r_tmp0, byte [p_src_row0 + i_xpos] |
| mov [p_dst - 1], r_tmp0b |
| %ifdef X86_32 |
| mov r_tmp0, i_scaleyd |
| add i_yposd, r_tmp0 |
| %else |
| add i_yposd, i_scaleyd |
| %endif |
| add p_dst, i_dst_stride_less_width |
| sub i_dst_height, 1 |
| jg %%height |
| %endmacro |
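; Driver notes (editorial): i_ypos is a Q15 row position (row = i_ypos >> 15)
; and i_xpos a Q16 column position; both start at half a source pixel so the
; sampling grid is center-aligned. The width loop overshoots by up to 7
; pixels, so after %%width the code steps p_dst back and rewrites the row's
; last pixel as a plain nearest pick from p_src_row0.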
| |
| ;************************************************************************************************************** |
| ;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, |
| ; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, |
| ; uint32_t uiScaleY); |
| ; |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearFastDownsampler_ssse3 |
| %assign push_num 0 |
| %ifndef X86_32 |
| push r12 |
| push r13 |
| push rbx |
| push rbp |
| %assign push_num 4 |
| %ifdef WIN64 |
| push rdi |
| push rsi |
| %assign push_num push_num + 2 |
| %endif |
| %endif |
| LOAD_7_PARA |
| PUSH_XMM 16 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| ZERO_EXTENSION r6d |
| sub r1, r2 ; dst_stride - dst_width |
| %ifdef X86_32 |
| movd xmm0, arg8 |
| movd xmm1, esp |
| and esp, -16 |
| %ifdef X86_32_PICASM |
| sub esp, 8 * 4 + 9 * 16 |
| %else |
| sub esp, 8 * 4 + 7 * 16 |
| %endif |
| movd [esp], xmm1 |
| %define p_dst r0 |
| %define i_dst_stride_less_width [esp + 1 * 4] |
| %define i_dst_width [esp + 2 * 4] |
| %define i_dst_height dword [esp + 3 * 4] |
| %define p_src [esp + 4 * 4] |
| %define i_src_stride [esp + 5 * 4] |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd [esp + 6 * 4] |
| %define i_xpos r2 |
| %define i_ypos dword [esp + 7 * 4] |
| %define i_yposd dword [esp + 7 * 4] |
| %define p_src_row0 r3 |
| %define p_src_row1 r4 |
| %define i_width_cnt r5 |
| %define r_tmp0 r1 |
| %define r_tmp0b r1b |
| %define xmm_xpos_frac xmm1 |
| %define xmm_xpos_frac_inc [esp + 8 * 4] |
| %define xmm_xpos_int xmm3 |
| %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16] |
| %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16] |
| %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16] |
| %define xmm_tmp0 xmm7 |
| %define xmm_tmp1 xmm0 |
| %define xmm_tmp2 xmm2 |
| %define xmm_tmp3 xmm4 |
| %define xmm_tmp4 xmm5 |
| %define xmm_tmp5 xmm6 |
| %define xmm_0 [esp + 8 * 4 + 4 * 16] |
| %define xmm_xpos_int_begin [esp + 8 * 4 + 5 * 16] |
| %define xmm_xpos_frac_begin [esp + 8 * 4 + 6 * 16] |
| %ifdef X86_32_PICASM |
| %define xmm_db80h [esp + 8 * 4 + 7 * 16] |
| %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16] |
| pxor xmm_tmp4, xmm_tmp4 |
| pcmpeqb xmm_tmp5, xmm_tmp5 |
| psubb xmm_tmp4, xmm_tmp5 |
| movdqa xmm_tmp3, xmm_tmp4 |
| psllw xmm_tmp3, 3 |
| pslldq xmm_tmp3, 8 |
| movdqa xmm_shufb_0000000088888888, xmm_tmp3 |
| psllw xmm_tmp4, 7 |
| movdqa xmm_db80h, xmm_tmp4 |
| %else |
| %define xmm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %endif |
| mov i_dst_stride_less_width, r1 |
| mov i_dst_width, r2 |
| mov i_dst_height, r3 |
| mov p_src, r4 |
| mov i_src_stride, r5 |
| movd i_scaleyd, xmm0 |
| pxor xmm_tmp0, xmm_tmp0 |
| movdqa xmm_0, xmm_tmp0 |
| %else |
| %define p_dst r0 |
| %define i_dst_stride_less_width r1 |
| %define i_dst_width r2 |
| %define i_dst_height r3 |
| %define p_src r4 |
| %define i_src_stride r5 |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd dword arg8d |
| %define i_xpos r12 |
| %define i_ypos r13 |
| %define i_yposd r13d |
| %define p_src_row0 rbp |
| %ifdef WIN64 |
| %define p_src_row1 rsi |
| %define i_width_cnt rdi |
| %else |
| %define p_src_row1 r11 |
| %define i_width_cnt rax |
| %endif |
| %define r_tmp0 rbx |
| %define r_tmp0b bl |
| %define xmm_0 xmm0 |
| %define xmm_xpos_frac xmm1 |
| %define xmm_xpos_frac_inc xmm8 |
| %define xmm_xpos_int xmm3 |
| %define xmm_xpos_int_inc xmm10 |
| %define xmm_yfrac0 xmm11 |
| %define xmm_yfrac1 xmm12 |
| %define xmm_tmp0 xmm7 |
| %define xmm_tmp1 xmm2 |
| %define xmm_tmp2 xmm9 |
| %define xmm_tmp3 xmm4 |
| %define xmm_tmp4 xmm5 |
| %define xmm_tmp5 xmm6 |
| %define xmm_xpos_int_begin xmm14 |
| %define xmm_xpos_frac_begin xmm15 |
| %define xmm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| pxor xmm_0, xmm_0 |
| %endif |
| |
| sub i_dst_height, 1 |
| je .final_row |
| jl .done |
| |
| mov i_ypos, 1 << 14 |
| movd xmm_xpos_frac, i_scalexd |
| pshufd xmm_xpos_frac, xmm_xpos_frac, 0 |
| movdqa xmm_tmp0, xmm_xpos_frac |
| pslld xmm_tmp0, 2 |
| pslldq xmm_xpos_frac, 4 |
| paddd xmm_tmp0, xmm_xpos_frac |
| movdqa xmm_tmp1, xmm_xpos_frac |
| pslldq xmm_tmp1, 4 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| pslldq xmm_tmp1, 4 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| pcmpeqw xmm_tmp1, xmm_tmp1 |
| psrld xmm_tmp1, 31 |
| pslld xmm_tmp1, 15 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| movdqa xmm_xpos_int, xmm_xpos_frac |
| movdqa xmm_tmp1, xmm_tmp0 |
| psrld xmm_xpos_int, 16 |
| psrld xmm_tmp1, 16 |
| packssdw xmm_xpos_int, xmm_tmp1 |
| packuswb xmm_xpos_int, xmm_xpos_int |
| movdqa xmm_tmp1, xmm_xpos_int |
| pcmpeqw xmm_tmp2, xmm_tmp2 |
| psubb xmm_tmp1, xmm_tmp2 |
| punpcklbw xmm_xpos_int, xmm_tmp1 |
| pslld xmm_xpos_frac, 16 |
| pslld xmm_tmp0, 16 |
| psrad xmm_xpos_frac, 16 |
| psrad xmm_tmp0, 16 |
| packssdw xmm_xpos_frac, xmm_tmp0 |
| movd xmm_tmp0, i_scalexd |
| pslld xmm_tmp0, 3 |
| movdqa xmm_tmp1, xmm_tmp0 |
| punpcklwd xmm_tmp0, xmm_tmp0 |
| pshufd xmm_tmp0, xmm_tmp0, 0 |
| movdqa xmm_xpos_frac_inc, xmm_tmp0 |
| psrld xmm_tmp1, 16 |
| psubw xmm_tmp1, xmm_tmp2 |
| pxor xmm_tmp2, xmm_tmp2 |
| pshufb xmm_tmp1, xmm_tmp2 |
| movdqa xmm_xpos_int_inc, xmm_tmp1 |
| movdqa xmm_xpos_int_begin, xmm_xpos_int |
| movdqa xmm_xpos_frac_begin, xmm_xpos_frac |
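; At this point xmm_xpos_int holds, per output pixel, the source byte pair
; (x, x+1); the kernels rebase these against the first lane's x so they can
; serve as pshufb indices into a 16-byte load. xmm_xpos_frac holds the
; matching Q16 fractions, and the *_inc registers advance all lanes by
; 8 * i_scalex per iteration.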
| |
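; Dispatch on the Q16 scale: up to 2.0, eight output pixels fit a single
; 16-byte load; up to 4.0, four pixels per load still fit; anything wider
; falls back to the per-pixel pinsrw gather of the general kernel.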
| cmp i_scalex, 4 << 16 |
| ja .scalex_above4 |
| cmp i_scalex, 2 << 16 |
| ja .scalex_above2_beloweq4 |
| SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1 |
| jmp .final_row |
| %ifdef X86_32 |
| %undef xmm_yfrac0 |
| %xdefine xmm_yfrac0 xmm_tmp5 |
| %undef xmm_tmp5 |
| %endif |
| .scalex_above2_beloweq4: |
| SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1 |
| jmp .final_row |
| .scalex_above4: |
| %xdefine xmm_xfrac0 xmm_xpos_frac |
| %xdefine xmm_xfrac1 xmm_xpos_int |
| %xdefine xmm_xfrac0_begin xmm_xpos_int_begin |
| %xdefine xmm_xfrac1_begin xmm_xpos_frac_begin |
| %xdefine xmm_xfrac_inc xmm_xpos_frac_inc |
| %undef xmm_xpos_int |
| %undef xmm_xpos_frac |
| %undef xmm_xpos_int_begin |
| %undef xmm_xpos_frac_begin |
| %undef xmm_xpos_int_inc |
| %undef xmm_xpos_frac_inc |
| SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0 |
| movdqa xmm_xfrac0, xmm_tmp0 |
| movdqa xmm_xfrac0_begin, xmm_xfrac0 |
| movdqa xmm_xfrac1_begin, xmm_xfrac1 |
| pcmpeqw xmm_tmp0, xmm_tmp0 |
| pmullw xmm_tmp0, xmm_xfrac_inc |
| punpcklwd xmm_tmp0, xmm_xfrac_inc |
| movdqa xmm_xfrac_inc, xmm_tmp0 |
| SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1 |
| |
| .final_row: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov i_xpos, 1 << 15 |
| mov i_width_cnt, i_dst_width |
| |
| .final_row_width: |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movzx r_tmp0, byte [p_src_row0 + r_tmp0] |
| mov [p_dst], r_tmp0b |
| add p_dst, 1 |
| add i_xpos, i_scalex |
| sub i_width_cnt, 1 |
| jg .final_row_width |
| |
| .done: |
| %ifdef X86_32 |
| mov esp, [esp] |
| %endif |
| POP_XMM |
| LOAD_7_PARA_POP |
| %ifndef X86_32 |
| %ifdef WIN64 |
| pop rsi |
| pop rdi |
| %endif |
| pop rbp |
| pop rbx |
| pop r13 |
| pop r12 |
| %endif |
| ret |
| %undef p_dst |
| %undef i_dst_stride_less_width |
| %undef i_dst_width |
| %undef i_dst_height |
| %undef p_src |
| %undef i_src_stride |
| %undef i_scalex |
| %undef i_scalexd |
| %undef i_scaleyd |
| %undef i_xpos |
| %undef i_ypos |
| %undef i_yposd |
| %undef p_src_row0 |
| %undef p_src_row1 |
| %undef i_width_cnt |
| %undef r_tmp0 |
| %undef r_tmp0b |
| %undef xmm_0 |
| %undef xmm_xpos_frac |
| %undef xmm_xpos_frac_inc |
| %undef xmm_xpos_int |
| %undef xmm_xpos_int_inc |
| %undef xmm_yfrac0 |
| %undef xmm_yfrac1 |
| %undef xmm_tmp0 |
| %undef xmm_tmp1 |
| %undef xmm_tmp2 |
| %undef xmm_tmp3 |
| %undef xmm_tmp4 |
| %undef xmm_tmp5 |
| %undef xmm_xpos_int_begin |
| %undef xmm_xpos_frac_begin |
| %undef xmm_xfrac0 |
| %undef xmm_xfrac1 |
| %undef xmm_xfrac0_begin |
| %undef xmm_xfrac1_begin |
| %undef xmm_xfrac_inc |
| %undef xmm_db80h |
| %undef xmm_shufb_0000000088888888 |
| |
| ;************************************************************************************************************** |
| ;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, |
| ; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, |
| ; uint32_t uiScaleY); |
| ; |
| ;************************************************************************************************************** |
| |
| WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41 |
| %assign push_num 0 |
| %ifndef X86_32 |
| push r12 |
| push r13 |
| push rbx |
| push rbp |
| %assign push_num 4 |
| %ifdef WIN64 |
| push rdi |
| push rsi |
| %assign push_num push_num + 2 |
| %endif |
| %endif |
| LOAD_7_PARA |
| PUSH_XMM 16 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| ZERO_EXTENSION r6d |
| sub r1, r2 ; dst_stride - dst_width |
| add r6, r6 ; 2 * scalex |
| %ifdef X86_32 |
| movd xmm0, arg8 |
| movd xmm1, esp |
| and esp, -16 |
| %ifdef X86_32_PICASM |
| sub esp, 8 * 4 + 10 * 16 |
| %else |
| sub esp, 8 * 4 + 8 * 16 |
| %endif |
| movd [esp], xmm1 |
| %define p_dst r0 |
| %define i_dst_stride_less_width [esp + 1 * 4] |
| %define i_dst_width [esp + 2 * 4] |
| %define i_dst_height dword [esp + 3 * 4] |
| %define p_src [esp + 4 * 4] |
| %define i_src_stride [esp + 5 * 4] |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd [esp + 6 * 4] |
| %define i_xpos r2 |
| %define i_ypos dword [esp + 7 * 4] |
| %define i_yposd dword [esp + 7 * 4] |
| %define p_src_row0 r3 |
| %define p_src_row1 r4 |
| %define i_width_cnt r5 |
| %define r_tmp0 r1 |
| %define r_tmp0b r1b |
| %define xmm_xpos_frac xmm1 |
| %define xmm_xpos_frac_inc [esp + 8 * 4] |
| %define xmm_xpos_int xmm3 |
| %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16] |
| %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16] |
| %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16] |
| %define xmm_tmp0 xmm7 |
| %define xmm_tmp1 xmm0 |
| %define xmm_tmp2 xmm2 |
| %define xmm_tmp3 xmm4 |
| %define xmm_tmp4 xmm5 |
| %define xmm_tmp5 xmm6 |
| %define xmm_0 [esp + 8 * 4 + 4 * 16] |
| %define xmm_7fff [esp + 8 * 4 + 5 * 16] |
| %define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16] |
| %define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16] |
| %ifdef X86_32_PICASM |
| %define xmm_db80h [esp + 8 * 4 + 8 * 16] |
| %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16] |
| pxor xmm_tmp4, xmm_tmp4 |
| pcmpeqb xmm_tmp5, xmm_tmp5 |
| psubb xmm_tmp4, xmm_tmp5 |
| movdqa xmm_tmp3, xmm_tmp4 |
| psllw xmm_tmp3, 3 |
| pslldq xmm_tmp3, 8 |
| movdqa xmm_shufb_0000000088888888, xmm_tmp3 |
| psllw xmm_tmp4, 7 |
| movdqa xmm_db80h, xmm_tmp4 |
| %else |
| %define xmm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %endif |
| mov i_dst_stride_less_width, r1 |
| mov i_dst_width, r2 |
| mov i_dst_height, r3 |
| mov p_src, r4 |
| mov i_src_stride, r5 |
| movd i_scaleyd, xmm0 |
| pxor xmm_tmp5, xmm_tmp5 |
| movdqa xmm_0, xmm_tmp5 |
| pcmpeqw xmm_tmp5, xmm_tmp5 |
| psrlw xmm_tmp5, 1 |
| movdqa xmm_7fff, xmm_tmp5 |
| %else |
| %define p_dst r0 |
| %define i_dst_stride_less_width r1 |
| %define i_dst_width r2 |
| %define i_dst_height r3 |
| %define p_src r4 |
| %define i_src_stride r5 |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd dword arg8d |
| %define i_xpos r12 |
| %define i_ypos r13 |
| %define i_yposd r13d |
| %define p_src_row0 rbp |
| %ifdef WIN64 |
| %define p_src_row1 rsi |
| %define i_width_cnt rdi |
| %else |
| %define p_src_row1 r11 |
| %define i_width_cnt rax |
| %endif |
| %define r_tmp0 rbx |
| %define r_tmp0b bl |
| %define xmm_0 xmm0 |
| %define xmm_xpos_frac xmm1 |
| %define xmm_xpos_frac_inc xmm8 |
| %define xmm_xpos_int xmm3 |
| %define xmm_xpos_int_inc xmm10 |
| %define xmm_yfrac0 xmm11 |
| %define xmm_yfrac1 xmm12 |
| %define xmm_tmp0 xmm7 |
| %define xmm_tmp1 xmm2 |
| %define xmm_tmp2 xmm9 |
| %define xmm_tmp3 xmm4 |
| %define xmm_tmp4 xmm5 |
| %define xmm_tmp5 xmm6 |
| %define xmm_7fff xmm13 |
| %define xmm_xpos_int_begin xmm14 |
| %define xmm_xpos_frac_begin xmm15 |
| %define xmm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| pxor xmm_0, xmm_0 |
| pcmpeqw xmm_7fff, xmm_7fff |
| psrlw xmm_7fff, 1 |
| %endif |
| |
| sub i_dst_height, 1 |
| je .final_row |
| jl .done |
| |
| mov i_ypos, 1 << 14 |
| movd xmm_xpos_frac, i_scalexd |
| pshufd xmm_xpos_frac, xmm_xpos_frac, 0 |
| movdqa xmm_tmp0, xmm_xpos_frac |
| pslld xmm_tmp0, 2 |
| pslldq xmm_xpos_frac, 4 |
| paddd xmm_tmp0, xmm_xpos_frac |
| movdqa xmm_tmp1, xmm_xpos_frac |
| pslldq xmm_tmp1, 4 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| pslldq xmm_tmp1, 4 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| pcmpeqw xmm_tmp1, xmm_tmp1 |
| psrld xmm_tmp1, 31 |
| pslld xmm_tmp1, 15 |
| paddd xmm_xpos_frac, xmm_tmp1 |
| paddd xmm_tmp0, xmm_tmp1 |
| movdqa xmm_xpos_int, xmm_xpos_frac |
| movdqa xmm_tmp1, xmm_tmp0 |
| psrld xmm_xpos_int, 16 |
| psrld xmm_tmp1, 16 |
| packssdw xmm_xpos_int, xmm_tmp1 |
| packuswb xmm_xpos_int, xmm_xpos_int |
| movdqa xmm_tmp1, xmm_xpos_int |
| pcmpeqw xmm_tmp2, xmm_tmp2 |
| psubb xmm_tmp1, xmm_tmp2 |
| punpcklbw xmm_xpos_int, xmm_tmp1 |
| pslld xmm_xpos_frac, 16 |
| pslld xmm_tmp0, 16 |
| psrad xmm_xpos_frac, 16 |
| psrad xmm_tmp0, 16 |
| packssdw xmm_xpos_frac, xmm_tmp0 |
| psrlw xmm_xpos_frac, 1 |
| movd xmm_tmp0, i_scalexd |
| pslld xmm_tmp0, 3 |
| movdqa xmm_tmp1, xmm_tmp0 |
| punpcklwd xmm_tmp0, xmm_tmp0 |
| pshufd xmm_tmp0, xmm_tmp0, 0 |
| psrlw xmm_tmp0, 1 |
| movdqa xmm_xpos_frac_inc, xmm_tmp0 |
| psrld xmm_tmp1, 16 |
| pxor xmm_tmp2, xmm_tmp2 |
| pshufb xmm_tmp1, xmm_tmp2 |
| movdqa xmm_xpos_int_inc, xmm_tmp1 |
| movdqa xmm_xpos_int_begin, xmm_xpos_int |
| movdqa xmm_xpos_frac_begin, xmm_xpos_frac |
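; Same x-position setup as the fast path, except the fractions are narrowed
; to Q15 (the psrlw-by-1 steps above) so they remain non-negative operands
; for the signed pmaddwd in the accurate kernels.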
| |
| cmp i_scalex, 4 << 16 |
| ja .scalex_above4 |
| cmp i_scalex, 2 << 16 |
| ja .scalex_above2_beloweq4 |
| SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0 |
| jmp .final_row |
| %ifdef X86_32 |
| %undef xmm_yfrac0 |
| %xdefine xmm_yfrac0 xmm_tmp5 |
| %undef xmm_tmp5 |
| %endif |
| .scalex_above2_beloweq4: |
| SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0 |
| jmp .final_row |
| .scalex_above4: |
| %xdefine xmm_xfrac0 xmm_xpos_frac |
| %xdefine xmm_xfrac1 xmm_xpos_int |
| %xdefine xmm_xfrac0_begin xmm_xpos_int_begin |
| %xdefine xmm_xfrac1_begin xmm_xpos_frac_begin |
| %xdefine xmm_xfrac_inc xmm_xpos_frac_inc |
| %undef xmm_xpos_int |
| %undef xmm_xpos_frac |
| %undef xmm_xpos_int_begin |
| %undef xmm_xpos_frac_begin |
| %undef xmm_xpos_int_inc |
| %undef xmm_xpos_frac_inc |
| SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff |
| movdqa xmm_xfrac0, xmm_tmp0 |
| movdqa xmm_xfrac0_begin, xmm_xfrac0 |
| movdqa xmm_xfrac1_begin, xmm_xfrac1 |
| pcmpeqw xmm_tmp0, xmm_tmp0 |
| pmullw xmm_tmp0, xmm_xfrac_inc |
| punpcklwd xmm_tmp0, xmm_xfrac_inc |
| movdqa xmm_xfrac_inc, xmm_tmp0 |
| SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0 |
| |
| .final_row: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov i_xpos, 1 << 15 |
| mov i_width_cnt, i_dst_width |
| |
| .final_row_width: |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movzx r_tmp0, byte [p_src_row0 + r_tmp0] |
| mov [p_dst], r_tmp0b |
| add p_dst, 1 |
| add i_xpos, i_scalex |
| sub i_width_cnt, 1 |
| jg .final_row_width |
| |
| .done: |
| %ifdef X86_32 |
| mov esp, [esp] |
| %endif |
| POP_XMM |
| LOAD_7_PARA_POP |
| %ifndef X86_32 |
| %ifdef WIN64 |
| pop rsi |
| pop rdi |
| %endif |
| pop rbp |
| pop rbx |
| pop r13 |
| pop r12 |
| %endif |
| ret |
| %undef p_dst |
| %undef i_dst_stride_less_width |
| %undef i_dst_width |
| %undef i_dst_height |
| %undef p_src |
| %undef i_src_stride |
| %undef i_scalex |
| %undef i_scalexd |
| %undef i_scaleyd |
| %undef i_xpos |
| %undef i_ypos |
| %undef i_yposd |
| %undef p_src_row0 |
| %undef p_src_row1 |
| %undef i_width_cnt |
| %undef r_tmp0 |
| %undef r_tmp0b |
| %undef xmm_0 |
| %undef xmm_xpos_frac |
| %undef xmm_xpos_frac_inc |
| %undef xmm_xpos_int |
| %undef xmm_xpos_int_inc |
| %undef xmm_yfrac0 |
| %undef xmm_yfrac1 |
| %undef xmm_tmp0 |
| %undef xmm_tmp1 |
| %undef xmm_tmp2 |
| %undef xmm_tmp3 |
| %undef xmm_tmp4 |
| %undef xmm_tmp5 |
| %undef xmm_7fff |
| %undef xmm_xpos_int_begin |
| %undef xmm_xpos_frac_begin |
| %undef xmm_xfrac0 |
| %undef xmm_xfrac1 |
| %undef xmm_xfrac0_begin |
| %undef xmm_xfrac1_begin |
| %undef xmm_xfrac_inc |
| %undef xmm_db80h |
| %undef xmm_shufb_0000000088888888 |
| |
| %ifdef HAVE_AVX2 |
| ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5 |
| %macro AVX2_BilinearIncXposuw 5 |
| vpaddusw %5, %2, %4 |
| vpaddw %2, %2, %4 |
| vpcmpeqw %5, %5, %2 |
| vpaddb %1, %1, %3 |
| vpaddb %1, %1, %5 ; subtract 1 if no carry |
| %endmacro |
| |
| ; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4 |
| %macro AVX2_UnpckXFrac 4 |
| vpxor %1, %3, %4 |
| vpunpckhwd %2, %1, %3 |
| vpunpcklwd %1, %1, %3 |
| %endmacro |
| |
| ; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5 |
| %macro AVX2_BilinearFastCalcXYFrac 5 |
| vpmulhuw %2, %3, %5 |
| vpmulhuw %1, %3, %4 |
| %endmacro |
| |
| ; [in:dwordsl out:bytes] dwordsh=%2 zero=%3 |
| %macro AVX2_BilinearFastPackDwordsToBytes 3 |
| vpsrld %1, %1, 14 |
| vpsrld %2, %2, 14 |
| vpackssdw %1, %1, %2 |
| vpavgw %1, %1, %3 |
| vpackuswb %1, %1, %1 |
| %endmacro |
| |
| %macro AVX2_BilinearFastDownsample2xOrLess_16px 0 |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| lea i_xpos, [i_xpos + 8 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int |
| AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 |
| vpunpcklbw ymm_tmp3, ymm_tmp4, ymm_0 |
| vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp3 |
| vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1 |
| vpunpckhbw ymm_tmp2, ymm_tmp4, ymm_0 |
| vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp2 |
| vpunpckhbw ymm_tmp2, ymm_tmp5, ymm_0 |
| vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp2 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0 |
| %endmacro |
| |
| %macro AVX2_BilinearFastDownsample4xOrLess_16px 0 |
| vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888 |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex2] |
| lea i_xpos, [r_tmp0 + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vpunpcklbw ymm_tmp2, ymm_xpos_int, ymm_ffff |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2 |
| vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 2 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 |
| vpunpckhbw ymm_tmp2, ymm_xpos_int, ymm_ffff |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2 |
| vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2 |
| AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0 |
| %endmacro |
| |
| %macro AVX2_BilinearFastDownsample8xOrLess_16px 0 |
| vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1 |
| vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_ffff |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3 |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3 |
| vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3 |
| vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff |
| AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5 |
| vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] |
| mov r_tmp0, i_xpos |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int |
| vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int |
| vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b |
| vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0 |
| vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4 |
| vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp5 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0 |
| %endmacro |
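; At scales up to 8x a 16-byte load covers only two output pixels, so this
; kernel assembles each source row from eight 16-byte reads (vmovdqu plus
; vinserti128, two output pixels each), merging them with vpshufb and
; vpblendd before the usual fraction weighting.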
| |
| %macro AVX2_GeneralBilinearFastDownsample_16px 0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0] |
| vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0 |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2] |
| vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2] |
| vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0] |
| vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2] |
| vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2] |
| vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovd xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovd xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3 |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovd xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovd xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3 |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b |
| vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b |
| vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0 |
| vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4 |
| vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp5 |
| vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0 |
| vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0 |
| AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1 |
| vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2 |
| AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 |
| vpermq ymm_tmp0, ymm_tmp0, 0010b |
| vmovdqu [p_dst], xmm_tmp0 |
| add p_dst, 16 |
| vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc |
| vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc |
| %endmacro |
| |
| ; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7 |
| %macro AVX2_BilinearIncXposw 7 |
| vpaddb %1, %1, %3 |
| vpaddw %6, %2, %4 |
| vpcmpgtw %7, %2, %6 |
| vpsubb %1, %1, %7 ; add carry |
| vpand %2, %6, %5 |
| %endmacro |
| |
| ; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6 |
| %macro AVX2_LinearAccurateInterpolateVerticalDwords 6 |
| vpshufd %1, %2, 10110001b |
| vpshufd %6, %3, 10110001b |
| vpmuludq %1, %1, %4 |
| vpmuludq %6, %6, %5 |
| vpaddq %1, %1, %6 |
| vpmuludq %2, %2, %4 |
| vpmuludq %3, %3, %5 |
| vpaddq %2, %2, %3 |
| vpsllq %1, %1, 3 |
| vpsrlq %2, %2, 29 |
| vpblendd %1, %1, %2, 01010101b |
| %endmacro |
| |
| %macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0 |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| lea i_xpos, [i_xpos + 8 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int |
| vpunpcklbw ymm_tmp2, ymm_tmp4, ymm_0 |
| vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0 |
| vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0 |
| vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp0 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1 |
| vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2 |
| vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vpavgw ymm_tmp0, ymm_tmp0, ymm_0 |
| vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1 |
| %endmacro |
| |
| %macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0 |
| vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888 |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex2] |
| lea i_xpos, [r_tmp0 + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3 |
| vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0] |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 2 * i_scalex2] |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1 |
| vpunpckhbw ymm_tmp3, ymm_xpos_int, ymm_db80h |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3 |
| vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1 |
| vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp1 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3 |
| vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vpavgw ymm_tmp0, ymm_tmp0, ymm_0 |
| vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1 |
| %endmacro |
| |
| %macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0 |
| vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC |
| vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 |
| vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1 |
| vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3 |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3 |
| vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3 |
| vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b |
| AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0 |
| vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp0 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 4 * i_scalex2] |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 |
| mov r_tmp0, i_xpos |
| lea i_xpos, [i_xpos + 4 * i_scalex2] |
| shr r_tmp0, 16 |
| vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0] |
| vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] |
| mov r_tmp0, i_xpos |
| add i_xpos, i_scalex2 |
| shr r_tmp0, 16 |
| vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1 |
| vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 |
| vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int |
| vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int |
| vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int |
| vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b |
| vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0 |
| vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1 |
| vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3 |
| vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vpavgw ymm_tmp0, ymm_tmp0, ymm_0 |
| vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vmovlps [p_dst], xmm_tmp0 |
| vextracti128 [p_dst + 8], ymm_tmp0, 1 |
| add p_dst, 16 |
| AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1 |
| %endmacro |
| |
| %macro AVX2_GeneralBilinearAccurateDownsample_16px 0 |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0] |
| vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0 |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2] |
| vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2] |
| vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0] |
| vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + 1 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0] |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0] |
| vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2] |
| vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b |
| vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2] |
| vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovd xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovd xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3 |
| vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b |
| vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b |
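| ; pixels 12-15: likewise; ymm_tmp2/ymm_tmp3 end up as the pairs [12-15 | 4-7] |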
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| vmovd xmm_tmp0, [p_src_row0 + r_tmp0] |
| vmovd xmm_tmp1, [p_src_row1 + r_tmp0] |
| lea r_tmp0, [i_xpos + i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1 |
| lea r_tmp0, [i_xpos + 2 * i_scalex] |
| lea i_xpos, [i_xpos + 4 * i_scalex] |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2 |
| mov r_tmp0, i_xpos |
| sub r_tmp0, i_scalex |
| shr r_tmp0, 16 |
| vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3 |
| vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3 |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b |
| vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b |
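| ; zero-extend the pairs, weight horizontally with the 15-bit fractions |
| ; (vpmaddwd), interpolate vertically between the two rows, then pack and |
| ; round to 8 bits via vpavgw against zero, i.e. (v + 1) >> 1 |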
| vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0 |
| vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac0 |
| vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac0 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1 |
| vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0 |
| vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0 |
| vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac1 |
| vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac1 |
| AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2 |
| vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vpavgw ymm_tmp0, ymm_tmp0, ymm_0 |
| vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vextracti128 [p_dst], ymm_tmp0, 1 ; high lane: pixels 0-7 (bytes 8-15 of this store are don't-cares) |
| vmovlps [p_dst + 8], xmm_tmp0 ; low lane: pixels 8-15 overwrite the don't-care bytes |
| add p_dst, 16 |
| vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc |
| vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc |
| vpand ymm_xfrac0, ymm_xfrac0, ymm_7fff |
| vpand ymm_xfrac1, ymm_xfrac1, ymm_7fff |
| %endmacro |
| |
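| ; Row loop shared by all AVX2 general downscaler variants. Per output row: |
| ; derive the two source row pointers from i_ypos (15 fraction bits), split |
| ; i_ypos into the complementary vertical weights yfrac1 = ypos & 7fffh and |
| ; yfrac0 = 7fffh ^ yfrac1, then run the 16-pixel kernel across the row. |
| ; The rightmost pixel of each row is rewritten as a point sample from row 0 |
| ; before stepping p_dst by i_dst_stride_less_width to the next row. |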
| ; downsample_16px_macro=%1 (kernel emitting 16 output pixels per call) |
| ; b_fast=%2 (1: build the vertical weights as packed words for the fast |
| ; kernels; 0: build them as packed dwords for the accurate kernels) |
| %macro AVX2_GeneralBilinearDownsampler_loop 2 |
| %%height: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov p_src_row1, p_src_row0 |
| add p_src_row1, i_src_stride |
| %ifdef X86_32 |
| %if %2 |
| vpbroadcastw ymm_tmp1, i_ypos |
| vpsllw ymm_tmp1, ymm_tmp1, 1 |
| vpsrlw ymm_tmp1, ymm_tmp1, 1 |
| vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vpsrlw ymm_tmp0, ymm_tmp0, 1 |
| %else |
| vpbroadcastd ymm_tmp1, i_ypos |
| vpslld ymm_tmp1, ymm_tmp1, 17 |
| vpsrld ymm_tmp1, ymm_tmp1, 17 |
| vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vpsrld ymm_tmp0, ymm_tmp0, 17 |
| %endif |
| vpxor ymm_tmp0, ymm_tmp0, ymm_tmp1 |
| vmovdqa ymm_yfrac0, ymm_tmp0 |
| vmovdqa ymm_yfrac1, ymm_tmp1 |
| %else |
| vmovd xmm_tmp0, i_yposd |
| vpbroadcastw ymm_yfrac1, xmm_tmp0 |
| %if %2 |
| vpsllw ymm_yfrac1, ymm_yfrac1, 1 |
| vpsrlw ymm_yfrac1, ymm_yfrac1, 1 |
| vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0 |
| vpsrlw ymm_yfrac0, ymm_yfrac0, 1 |
| %else |
| vpslld ymm_yfrac1, ymm_yfrac1, 17 |
| vpsrld ymm_yfrac1, ymm_yfrac1, 17 |
| vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0 |
| vpsrld ymm_yfrac0, ymm_yfrac0, 17 |
| %endif |
| vpxor ymm_yfrac0, ymm_yfrac0, ymm_yfrac1 |
| %endif |
| |
| mov i_xpos, 1 << 15 ; column start: half a source pixel |
| mov i_width_cnt, i_dst_width |
| sub i_width_cnt, 1 |
| |
| %ifdef ymm_xpos_int |
| vmovdqa ymm_xpos_int, ymm_xpos_int_begin |
| vmovdqa ymm_xpos_frac, ymm_xpos_frac_begin |
| %else |
| vmovdqa ymm_xfrac0, ymm_xfrac0_begin |
| vmovdqa ymm_xfrac1, ymm_xfrac1_begin |
| %endif |
| |
| %%width: |
| %1 |
| sub i_width_cnt, 16 |
| jg %%width |
| |
| lea p_dst, [p_dst + i_width_cnt + 1] |
| %ifdef i_scalex2 |
| mov r_tmp0, i_scalex2 |
| shr r_tmp0, 1 |
| imul i_width_cnt, r_tmp0 |
| %else |
| imul i_width_cnt, i_scalex |
| %endif |
| add i_xpos, i_width_cnt |
| shr i_xpos, 16 |
| movzx r_tmp0, byte [p_src_row0 + i_xpos] |
| mov [p_dst - 1], r_tmp0b |
| %ifdef X86_32 |
| mov r_tmp0, i_scaleyd |
| add i_yposd, r_tmp0 |
| %else |
| add i_yposd, i_scaleyd |
| %endif |
| add p_dst, i_dst_stride_less_width |
| sub i_dst_height, 1 |
| jg %%height |
| %endmacro |
| |
| ;************************************************************************************************************** |
| ;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, |
| ; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, |
| ; uint32_t uiScaleY); |
| ; |
| ;************************************************************************************************************** |
| |
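| ; For orientation, a scalar sketch of the grid this routine walks (comments |
| ; only; "bilinear" is a stand-in for the SIMD kernel math, whose exact |
| ; rounding is not reproduced here): |
| ; |
| ; uint32_t y = 1 << 14; // row position, 15 fraction bits |
| ; for (int j = 0; j + 1 < iDstHeight; ++j, y += uiScaleY) { |
| ; const uint8_t* r0 = pSrc + (y >> 15) * iSrcStride; |
| ; const uint8_t* r1 = r0 + iSrcStride; |
| ; uint32_t yf = y & 0x7fff; // weight of r1; 0x7fff ^ yf weighs r0 |
| ; uint32_t x = 1 << 15; // column position, 16 fraction bits |
| ; for (int i = 0; i + 1 < iDstWidth; ++i, x += uiScaleX) |
| ; pDst[i] = bilinear (r0, r1, x >> 16, x & 0xffff, yf); |
| ; pDst[iDstWidth - 1] = r0[x >> 16]; // last column: point sample |
| ; pDst += iDstStride; |
| ; } |
| ; // the bottom row is emitted separately as horizontal point samples |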
| WELS_EXTERN GeneralBilinearFastDownsampler_avx2 |
| %assign push_num 0 |
| %ifndef X86_32 |
| push r12 |
| push r13 |
| push rbx |
| push rbp |
| %assign push_num 4 |
| %ifdef WIN64 |
| push rdi |
| push rsi |
| %assign push_num push_num + 2 |
| %endif |
| %endif |
| LOAD_7_PARA |
| PUSH_XMM 16 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| ZERO_EXTENSION r6d |
| sub r1, r2 ; dst_stride - dst_width |
| %ifdef X86_32 |
| vmovd xmm0, arg8 ; uiScaleY |
| vmovd xmm1, esp ; stash the caller esp |
| and esp, -32 ; align the spill area for vmovdqa |
| %ifdef X86_32_PICASM |
| sub esp, 8 * 4 + 9 * 32 |
| %else |
| sub esp, 8 * 4 + 8 * 32 |
| %endif |
| vmovd [esp], xmm1 ; saved esp, restored at .done |
| %define p_dst r0 |
| %define i_dst_stride_less_width [esp + 1 * 4] |
| %define i_dst_width [esp + 2 * 4] |
| %define i_dst_height dword [esp + 3 * 4] |
| %define p_src [esp + 4 * 4] |
| %define i_src_stride [esp + 5 * 4] |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd [esp + 6 * 4] |
| %define i_xpos r2 |
| %define i_ypos [esp + 7 * 4] |
| %define i_yposd dword [esp + 7 * 4] |
| %define p_src_row0 r3 |
| %define p_src_row1 r4 |
| %define i_width_cnt r5 |
| %define r_tmp0 r1 |
| %define r_tmp0b r1b |
| %define ymm_xpos_frac ymm1 |
| %define ymm_xpos_frac_inc [esp + 8 * 4] |
| %define ymm_xpos_int ymm3 |
| %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32] |
| %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32] |
| %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32] |
| %define xmm_tmp0 xmm7 |
| %define ymm_tmp0 ymm7 |
| %define xmm_tmp1 xmm0 |
| %define ymm_tmp1 ymm0 |
| %define xmm_tmp2 xmm2 |
| %define ymm_tmp2 ymm2 |
| %define xmm_tmp3 xmm4 |
| %define ymm_tmp3 ymm4 |
| %define xmm_tmp4 xmm5 |
| %define ymm_tmp4 ymm5 |
| %define xmm_tmp5 xmm6 |
| %define ymm_tmp5 ymm6 |
| %define ymm_0 [esp + 8 * 4 + 4 * 32] |
| %define ymm_ffff [esp + 8 * 4 + 5 * 32] |
| %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32] |
| %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32] |
| %ifdef X86_32_PICASM |
| %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32] |
| %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16] |
| vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 |
| vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5 |
| vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5 |
| vpsllw ymm_tmp3, ymm_tmp4, 3 |
| vpslldq ymm_tmp3, ymm_tmp3, 8 |
| vmovdqa xmm_shufb_0000000088888888, xmm_tmp3 |
| vpsllq ymm_tmp5, ymm_tmp4, 34 |
| vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3 |
| vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5 |
| %else |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] |
| %endif |
| mov i_dst_stride_less_width, r1 |
| mov i_dst_width, r2 |
| mov i_dst_height, r3 |
| mov p_src, r4 |
| mov i_src_stride, r5 |
| vmovd i_scaleyd, xmm0 |
| vpxor xmm0, xmm0, xmm0 |
| vmovdqa ymm_0, ymm0 |
| vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vmovdqa ymm_ffff, ymm_tmp0 |
| %else |
| %define p_dst r0 |
| %define i_dst_stride_less_width r1 |
| %define i_dst_width r2 |
| %define i_dst_height r3 |
| %define p_src r4 |
| %define i_src_stride r5 |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd dword arg8d |
| %define i_xpos r12 |
| %define i_ypos r13 |
| %define i_yposd r13d |
| %define p_src_row0 rbp |
| %ifdef WIN64 |
| %define p_src_row1 rsi |
| %define i_width_cnt rdi |
| %else |
| %define p_src_row1 r11 |
| %define i_width_cnt rax |
| %endif |
| %define r_tmp0 rbx |
| %define r_tmp0b bl |
| %define ymm_0 ymm0 |
| %define ymm_xpos_frac ymm1 |
| %define ymm_xpos_frac_inc ymm2 |
| %define ymm_xpos_int ymm3 |
| %define ymm_xpos_int_inc ymm4 |
| %define ymm_yfrac0 ymm5 |
| %define ymm_yfrac1 ymm6 |
| %define xmm_tmp0 xmm7 |
| %define ymm_tmp0 ymm7 |
| %define xmm_tmp1 xmm8 |
| %define ymm_tmp1 ymm8 |
| %define xmm_tmp2 xmm9 |
| %define ymm_tmp2 ymm9 |
| %define xmm_tmp3 xmm10 |
| %define ymm_tmp3 ymm10 |
| %define xmm_tmp4 xmm11 |
| %define ymm_tmp4 ymm11 |
| %define xmm_tmp5 xmm12 |
| %define ymm_tmp5 ymm12 |
| %define ymm_ffff ymm13 |
| %define ymm_xpos_int_begin ymm14 |
| %define ymm_xpos_frac_begin ymm15 |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] |
| vpxor ymm_0, ymm_0, ymm_0 |
| vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff |
| %endif |
| |
| sub i_dst_height, 1 |
| je .final_row |
| jl .done |
| |
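| ; Seed the row position with half a source pixel (1 << 14, 15 fraction |
| ; bits) and build the 16 initial column positions k * uiScaleX + (1 << 15), |
| ; k = 0..15, as 16.16 dwords in ymm_tmp1/ymm_tmp2. Integer parts are packed |
| ; into ymm_xpos_int as interleaved (int, int + 1) byte pairs for use as |
| ; vpshufb gather indices; the low 16 bits go to ymm_xpos_frac as word |
| ; fractions. The per-iteration increments (16 * uiScaleX) and row-start |
| ; copies land in ymm_xpos_*_inc and ymm_xpos_*_begin. |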
| mov i_yposd, 1 << 14 |
| vmovd xmm_tmp0, i_scalexd |
| vpbroadcastd ymm_tmp0, xmm_tmp0 |
| vpslld ymm_tmp1, ymm_tmp0, 2 |
| vpslld ymm_tmp2, ymm_tmp0, 3 |
| vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2 |
| vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 |
| vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b |
| vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0 |
| vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b |
| vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b |
| vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0 |
| vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 |
| vpsrld ymm_tmp3, ymm_tmp3, 31 |
| vpslld ymm_tmp3, ymm_tmp3, 15 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpsrld ymm_xpos_int, ymm_tmp1, 16 |
| vpsrld ymm_tmp0, ymm_tmp2, 16 |
| vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| vpermq ymm_xpos_int, ymm_xpos_int, 11011000b |
| vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int |
| vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 |
| vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3 |
| vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| vpslld ymm_tmp1, ymm_tmp1, 16 |
| vpsrld ymm_tmp1, ymm_tmp1, 16 |
| vpslld ymm_tmp2, ymm_tmp2, 16 |
| vpsrld ymm_tmp2, ymm_tmp2, 16 |
| vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2 |
| vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b |
| vmovd xmm_tmp0, i_scalexd |
| vpslld xmm_tmp0, xmm_tmp0, 4 |
| vpbroadcastw ymm_tmp1, xmm_tmp0 |
| vmovdqa ymm_xpos_frac_inc, ymm_tmp1 |
| vpsrld xmm_tmp0, xmm_tmp0, 16 |
| vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3 |
| vpbroadcastb ymm_tmp0, xmm_tmp0 |
| vmovdqa ymm_xpos_int_inc, ymm_tmp0 |
| vmovdqa ymm_xpos_int_begin, ymm_xpos_int |
| vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac |
| |
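| ; Dispatch on the horizontal ratio: up to 2x/4x/8x use kernels that load a |
| ; contiguous window and shuffle; anything wider falls through to the pixel- |
| ; by-pixel gather kernel. The specialized loops step i_xpos two output |
| ; pixels at a time, so i_scalex is doubled and aliased as i_scalex2 while |
| ; they run, then halved back before the final row. |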
| cmp i_scalex, 4 << 16 |
| ja .scalex_above4 |
| cmp i_scalex, 2 << 16 |
| ja .scalex_above2_beloweq4 |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above2_beloweq4: |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above4: |
| cmp i_scalex, 8 << 16 |
| ja .scalex_above8 |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above8: |
| %xdefine ymm_xfrac0 ymm_xpos_frac |
| %xdefine ymm_xfrac1 ymm_xpos_int |
| %xdefine ymm_xfrac0_begin ymm_xpos_int_begin |
| %xdefine ymm_xfrac1_begin ymm_xpos_frac_begin |
| %xdefine ymm_xfrac_inc ymm_xpos_frac_inc |
| %undef ymm_xpos_int |
| %undef ymm_xpos_frac |
| %undef ymm_xpos_int_begin |
| %undef ymm_xpos_frac_begin |
| %undef ymm_xpos_int_inc |
| %undef ymm_xpos_frac_inc |
| AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff |
| vpermq ymm_xfrac0, ymm_tmp0, 01001110b |
| vpermq ymm_xfrac1, ymm_xfrac1, 01001110b |
| vmovdqa ymm_xfrac0_begin, ymm_xfrac0 |
| vmovdqa ymm_xfrac1_begin, ymm_xfrac1 |
| vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc |
| vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc |
| vmovdqa ymm_xfrac_inc, ymm_tmp0 |
| AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1 |
| |
| .final_row: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov i_xpos, 1 << 15 |
| mov i_width_cnt, i_dst_width |
| |
| .final_row_width: |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movzx r_tmp0, byte [p_src_row0 + r_tmp0] |
| mov [p_dst], r_tmp0b |
| add p_dst, 1 |
| add i_xpos, i_scalex |
| sub i_width_cnt, 1 |
| jg .final_row_width |
| |
| .done: |
| vzeroupper |
| %ifdef X86_32 |
| mov esp, [esp] |
| %endif |
| POP_XMM |
| LOAD_7_PARA_POP |
| %ifndef X86_32 |
| %ifdef WIN64 |
| pop rsi |
| pop rdi |
| %endif |
| pop rbp |
| pop rbx |
| pop r13 |
| pop r12 |
| %endif |
| ret |
| %undef p_dst |
| %undef i_dst_stride_less_width |
| %undef i_dst_width |
| %undef i_dst_height |
| %undef p_src |
| %undef i_src_stride |
| %undef i_scalex |
| %undef i_scalexd |
| %undef i_scaleyd |
| %undef i_xpos |
| %undef i_ypos |
| %undef i_yposd |
| %undef p_src_row0 |
| %undef p_src_row1 |
| %undef i_width_cnt |
| %undef r_tmp0 |
| %undef r_tmp0b |
| %undef ymm_xpos_frac |
| %undef ymm_xpos_frac_inc |
| %undef ymm_xpos_int |
| %undef ymm_xpos_int_inc |
| %undef ymm_yfrac0 |
| %undef ymm_yfrac1 |
| %undef xmm_tmp0 |
| %undef ymm_tmp0 |
| %undef xmm_tmp1 |
| %undef ymm_tmp1 |
| %undef xmm_tmp2 |
| %undef ymm_tmp2 |
| %undef xmm_tmp3 |
| %undef ymm_tmp3 |
| %undef xmm_tmp4 |
| %undef ymm_tmp4 |
| %undef xmm_tmp5 |
| %undef ymm_tmp5 |
| %undef ymm_ffff |
| %undef ymm_0 |
| %undef ymm_xpos_int_begin |
| %undef ymm_xpos_frac_begin |
| %undef ymm_xfrac0 |
| %undef ymm_xfrac1 |
| %undef ymm_xfrac0_begin |
| %undef ymm_xfrac1_begin |
| %undef ymm_xfrac_inc |
| %undef xmm_shufb_0000000088888888 |
| %undef xmm_shufb_000044448888CCCC |
| |
| ;************************************************************************************************************** |
| ;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, |
| ; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, |
| ; uint32_t uiScaleY); |
| ; |
| ;************************************************************************************************************** |
| |
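| ; Same driver as GeneralBilinearFastDownsampler_avx2 above. The differences |
| ; are confined to the accurate arithmetic: uiScaleX is doubled at entry so |
| ; the 16.16 column bookkeeping can be shared, the fractions are halved to |
| ; 15-bit weights for the signed vpmaddwd products, and the result is |
| ; rounded with vpavgw. |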
| WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2 |
| %assign push_num 0 |
| %ifndef X86_32 |
| push r12 |
| push r13 |
| push rbx |
| push rbp |
| %assign push_num 4 |
| %ifdef WIN64 |
| push rdi |
| push rsi |
| %assign push_num push_num + 2 |
| %endif |
| %endif |
| LOAD_7_PARA |
| PUSH_XMM 16 |
| SIGN_EXTENSION r1, r1d |
| SIGN_EXTENSION r2, r2d |
| SIGN_EXTENSION r3, r3d |
| SIGN_EXTENSION r5, r5d |
| ZERO_EXTENSION r6d |
| sub r1, r2 ; dst_stride - dst_width |
| add r6, r6 ; 2 * scalex: shared 16.16 column form; 15-bit weights are recovered below by halving the fractions |
| %ifdef X86_32 |
| vmovd xmm0, arg8 |
| vmovd xmm1, esp |
| and esp, -32 |
| %ifdef X86_32_PICASM |
| sub esp, 8 * 4 + 10 * 32 |
| %else |
| sub esp, 8 * 4 + 8 * 32 |
| %endif |
| vmovd [esp], xmm1 |
| %define p_dst r0 |
| %define i_dst_stride_less_width [esp + 1 * 4] |
| %define i_dst_width [esp + 2 * 4] |
| %define i_dst_height dword [esp + 3 * 4] |
| %define p_src [esp + 4 * 4] |
| %define i_src_stride [esp + 5 * 4] |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd [esp + 6 * 4] |
| %define i_xpos r2 |
| %define i_ypos [esp + 7 * 4] |
| %define i_yposd dword [esp + 7 * 4] |
| %define p_src_row0 r3 |
| %define p_src_row1 r4 |
| %define i_width_cnt r5 |
| %define r_tmp0 r1 |
| %define r_tmp0b r1b |
| %define ymm_xpos_frac ymm1 |
| %define ymm_xpos_frac_inc [esp + 8 * 4] |
| %define ymm_xpos_int ymm3 |
| %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32] |
| %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32] |
| %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32] |
| %define xmm_tmp0 xmm7 |
| %define ymm_tmp0 ymm7 |
| %define xmm_tmp1 xmm0 |
| %define ymm_tmp1 ymm0 |
| %define xmm_tmp2 xmm2 |
| %define ymm_tmp2 ymm2 |
| %define xmm_tmp3 xmm4 |
| %define ymm_tmp3 ymm4 |
| %define xmm_tmp4 xmm5 |
| %define ymm_tmp4 ymm5 |
| %define xmm_tmp5 xmm6 |
| %define ymm_tmp5 ymm6 |
| %define ymm_0 [esp + 8 * 4 + 4 * 32] |
| %define ymm_7fff [esp + 8 * 4 + 5 * 32] |
| %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32] |
| %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32] |
| %ifdef X86_32_PICASM |
| %define ymm_db80h [esp + 8 * 4 + 8 * 32] |
| %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32] |
| %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16] |
| vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 |
| vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5 |
| vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5 |
| vpsllw ymm_tmp3, ymm_tmp4, 3 |
| vpslldq ymm_tmp3, ymm_tmp3, 8 |
| vmovdqa xmm_shufb_0000000088888888, xmm_tmp3 |
| vpsllq ymm_tmp5, ymm_tmp4, 34 |
| vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3 |
| vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5 |
| vpsllw ymm_tmp4, ymm_tmp4, 7 |
| vmovdqa ymm_db80h, ymm_tmp4 |
| %else |
| %define ymm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] |
| %endif |
| mov i_dst_stride_less_width, r1 |
| mov i_dst_width, r2 |
| mov i_dst_height, r3 |
| mov p_src, r4 |
| mov i_src_stride, r5 |
| vmovd i_scaleyd, xmm0 |
| vpxor xmm0, xmm0, xmm0 |
| vmovdqa ymm_0, ymm0 |
| vpcmpeqw ymm0, ymm0, ymm0 |
| vpsrlw ymm0, ymm0, 1 |
| vmovdqa ymm_7fff, ymm0 |
| %else |
| %define p_dst r0 |
| %define i_dst_stride_less_width r1 |
| %define i_dst_width r2 |
| %define i_dst_height r3 |
| %define p_src r4 |
| %define i_src_stride r5 |
| %define i_scalex r6 |
| %define i_scalexd r6d |
| %define i_scaleyd dword arg8d |
| %define i_xpos r12 |
| %define i_ypos r13 |
| %define i_yposd r13d |
| %define p_src_row0 rbp |
| %ifdef WIN64 |
| %define p_src_row1 rsi |
| %define i_width_cnt rdi |
| %else |
| %define p_src_row1 r11 |
| %define i_width_cnt rax |
| %endif |
| %define r_tmp0 rbx |
| %define r_tmp0b bl |
| %define ymm_0 ymm0 |
| %define ymm_xpos_frac ymm1 |
| %define ymm_xpos_int ymm3 |
| %define ymm_xpos_frac_inc ymm2 |
| %define ymm_xpos_int_inc ymm4 |
| %define ymm_yfrac0 ymm5 |
| %define ymm_yfrac1 ymm6 |
| %define xmm_tmp0 xmm7 |
| %define ymm_tmp0 ymm7 |
| %define xmm_tmp1 xmm8 |
| %define ymm_tmp1 ymm8 |
| %define xmm_tmp2 xmm9 |
| %define ymm_tmp2 ymm9 |
| %define xmm_tmp3 xmm10 |
| %define ymm_tmp3 ymm10 |
| %define xmm_tmp4 xmm11 |
| %define ymm_tmp4 ymm11 |
| %define xmm_tmp5 xmm12 |
| %define ymm_tmp5 ymm12 |
| %define ymm_7fff ymm13 |
| %define ymm_xpos_int_begin ymm14 |
| %define ymm_xpos_frac_begin ymm15 |
| %define ymm_db80h [db80h_256] |
| %define xmm_shufb_0000000088888888 [shufb_0000000088888888] |
| %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] |
| vpxor ymm_0, ymm_0, ymm_0 |
| vpcmpeqw ymm_7fff, ymm_7fff, ymm_7fff |
| vpsrlw ymm_7fff, ymm_7fff, 1 |
| %endif |
| |
| sub i_dst_height, 1 |
| je .final_row |
| jl .done |
| |
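| ; Position setup mirrors the fast scaler, with one addition: the packed |
| ; fractions and their increment are shifted right by 1 to 15-bit weights, |
| ; keeping the vpmaddwd products of the accurate kernels in signed range. |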
| mov i_yposd, 1 << 14 |
| vmovd xmm_tmp0, i_scalexd |
| vpbroadcastd ymm_tmp0, xmm_tmp0 |
| vpslld ymm_tmp1, ymm_tmp0, 2 |
| vpslld ymm_tmp2, ymm_tmp0, 3 |
| vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2 |
| vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 |
| vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b |
| vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b |
| vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0 |
| vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b |
| vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b |
| vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0 |
| vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0 |
| vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 |
| vpsrld ymm_tmp3, ymm_tmp3, 31 |
| vpslld ymm_tmp3, ymm_tmp3, 15 |
| vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 |
| vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3 |
| vpsrld ymm_xpos_int, ymm_tmp1, 16 |
| vpsrld ymm_tmp0, ymm_tmp2, 16 |
| vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| vpermq ymm_xpos_int, ymm_xpos_int, 11011000b |
| vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int |
| vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 |
| vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3 |
| vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 |
| vpslld ymm_tmp1, ymm_tmp1, 16 |
| vpsrld ymm_tmp1, ymm_tmp1, 16 |
| vpslld ymm_tmp2, ymm_tmp2, 16 |
| vpsrld ymm_tmp2, ymm_tmp2, 16 |
| vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2 |
| vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b |
| vpsrlw ymm_xpos_frac, ymm_xpos_frac, 1 |
| vmovd xmm_tmp0, i_scalexd |
| vpslld xmm_tmp0, xmm_tmp0, 4 |
| vpbroadcastw ymm_tmp1, xmm_tmp0 |
| vpsrlw ymm_tmp1, ymm_tmp1, 1 |
| vmovdqa ymm_xpos_frac_inc, ymm_tmp1 |
| vpsrld xmm_tmp0, xmm_tmp0, 16 |
| vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3 |
| vpbroadcastb ymm_tmp0, xmm_tmp0 |
| vmovdqa ymm_xpos_int_inc, ymm_tmp0 |
| vmovdqa ymm_xpos_int_begin, ymm_xpos_int |
| vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac |
| |
| cmp i_scalex, 4 << 16 |
| ja .scalex_above4 |
| cmp i_scalex, 2 << 16 |
| ja .scalex_above2_beloweq4 |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above2_beloweq4: |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above4: |
| cmp i_scalex, 8 << 16 |
| ja .scalex_above8 |
| add i_scalex, i_scalex |
| %xdefine i_scalex2 i_scalex |
| %undef i_scalex |
| AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0 |
| shr i_scalex2, 1 |
| %xdefine i_scalex i_scalex2 |
| %undef i_scalex2 |
| jmp .final_row |
| .scalex_above8: |
| %xdefine ymm_xfrac0 ymm_xpos_frac |
| %xdefine ymm_xfrac1 ymm_xpos_int |
| %xdefine ymm_xfrac0_begin ymm_xpos_int_begin |
| %xdefine ymm_xfrac1_begin ymm_xpos_frac_begin |
| %xdefine ymm_xfrac_inc ymm_xpos_frac_inc |
| %undef ymm_xpos_int |
| %undef ymm_xpos_frac |
| %undef ymm_xpos_int_begin |
| %undef ymm_xpos_frac_begin |
| %undef ymm_xpos_int_inc |
| %undef ymm_xpos_frac_inc |
| AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff |
| vpermq ymm_xfrac0, ymm_tmp0, 01001110b |
| vpermq ymm_xfrac1, ymm_xfrac1, 01001110b |
| vmovdqa ymm_xfrac0_begin, ymm_xfrac0 |
| vmovdqa ymm_xfrac1_begin, ymm_xfrac1 |
| vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 |
| vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc |
| vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc |
| vmovdqa ymm_xfrac_inc, ymm_tmp0 |
| AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0 |
| |
| .final_row: |
| mov p_src_row0, i_ypos |
| shr p_src_row0, 15 |
| imul p_src_row0, i_src_stride |
| add p_src_row0, p_src |
| mov i_xpos, 1 << 15 |
| mov i_width_cnt, i_dst_width |
| |
| .final_row_width: |
| mov r_tmp0, i_xpos |
| shr r_tmp0, 16 |
| movzx r_tmp0, byte [p_src_row0 + r_tmp0] |
| mov [p_dst], r_tmp0b |
| add p_dst, 1 |
| add i_xpos, i_scalex |
| sub i_width_cnt, 1 |
| jg .final_row_width |
| |
| .done: |
| vzeroupper |
| %ifdef X86_32 |
| mov esp, [esp] |
| %endif |
| POP_XMM |
| LOAD_7_PARA_POP |
| %ifndef X86_32 |
| %ifdef WIN64 |
| pop rsi |
| pop rdi |
| %endif |
| pop rbp |
| pop rbx |
| pop r13 |
| pop r12 |
| %endif |
| ret |
| %undef p_dst |
| %undef i_dst_stride_less_width |
| %undef i_dst_width |
| %undef i_dst_height |
| %undef p_src |
| %undef i_src_stride |
| %undef i_scalex |
| %undef i_scalexd |
| %undef i_scaleyd |
| %undef i_xpos |
| %undef i_ypos |
| %undef i_yposd |
| %undef p_src_row0 |
| %undef p_src_row1 |
| %undef i_width_cnt |
| %undef r_tmp0 |
| %undef r_tmp0b |
| %undef ymm_xpos_frac |
| %undef ymm_xpos_frac_inc |
| %undef ymm_xpos_int |
| %undef ymm_xpos_int_inc |
| %undef ymm_yfrac0 |
| %undef ymm_yfrac1 |
| %undef xmm_tmp0 |
| %undef ymm_tmp0 |
| %undef xmm_tmp1 |
| %undef ymm_tmp1 |
| %undef xmm_tmp2 |
| %undef ymm_tmp2 |
| %undef xmm_tmp3 |
| %undef ymm_tmp3 |
| %undef xmm_tmp4 |
| %undef ymm_tmp4 |
| %undef xmm_tmp5 |
| %undef ymm_tmp5 |
| %undef ymm_0 |
| %undef ymm_7fff |
| %undef ymm_xpos_int_begin |
| %undef ymm_xpos_frac_begin |
| %undef ymm_xfrac0 |
| %undef ymm_xfrac1 |
| %undef ymm_xfrac0_begin |
| %undef ymm_xfrac1_begin |
| %undef ymm_xfrac_inc |
| %undef ymm_db80h |
| %undef xmm_shufb_0000000088888888 |
| %undef xmm_shufb_000044448888CCCC |
| |
| %endif |