Refine mc_put_8tap

Performance improves by around 68%~156% over the previous LSX implementation.
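
Replace the per-pixel vbsrl.v/vilvl.d shifting and vdp2 dot products with
vshuf.b gather tables (subpel_h_shuf0/1) plus even/odd widening
multiply-accumulates, keep the vertical taps as replicated pairs so rows
can slide through a register window, and widen the w>=16 horizontal loop
to 16 pixels per iteration.

For reference, a minimal scalar sketch of what these paths compute,
modeled on dav1d's put_8tap_c at 8 bpc (intermediate_bits == 4, hence the
+34 bias, the (x + 2) >> 2 intermediate rounding and the (x + 512) >> 10
final rounding seen below); clip_u8/put_h/put_hv are illustrative names,
not code from the tree:

    #include <stddef.h>
    #include <stdint.h>

    static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static int filter_8tap(const uint8_t *src, const int8_t *f,
                           ptrdiff_t stride)
    {
        int sum = 0;
        for (int k = 0; k < 8; k++)      /* taps span src[-3..+4] */
            sum += f[k] * src[(k - 3) * stride];
        return sum;
    }

    /* h-only: (sum + 34) >> 6, i.e. vadd.h with vr9 = 34 followed by a
     * saturating narrow by 6 (vssrani.bu.h) */
    static uint8_t put_h(const uint8_t *src, const int8_t *fh)
    {
        return clip_u8((filter_8tap(src, fh, 1) + 34) >> 6);
    }

    /* hv: horizontal intermediates rounded to (sum + 2) >> 2
     * (vssrarni.h.w 2), vertical result to (sum + 512) >> 10
     * (vssrarni.hu.w 10) */
    static uint8_t put_hv(const uint8_t *src, ptrdiff_t stride,
                          const int8_t *fh, const int8_t *fv)
    {
        int16_t mid[8];
        for (int r = 0; r < 8; r++)
            mid[r] = (filter_8tap(src + (r - 3) * stride, fh, 1) + 2) >> 2;
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += fv[k] * mid[k];
        return clip_u8((sum + 512) >> 10);
    }

For w <= 4 only the middle four taps are non-zero, which the 2w/4w paths
exploit by stepping fh ahead by 2 bytes and adjusting the src offset.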

Change-Id: I0b39cd0e05e3cbd84fded121d29a91ea2a620f03
diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S
index 97887de..f4cd61f 100644
--- a/src/loongarch/mc.S
+++ b/src/loongarch/mc.S
@@ -2634,114 +2634,49 @@
     vhaddw.q.d  \in0,  \in0,  \in0
 .endm
 .macro PUT_H_8W in0
-    vbsrl.v          vr2,    \in0,  1
-    vbsrl.v          vr3,    \in0,  2
-    vbsrl.v          vr4,    \in0,  3
-    vbsrl.v          vr5,    \in0,  4
-    vbsrl.v          vr6,    \in0,  5
-    vbsrl.v          vr7,    \in0,  6
-    vbsrl.v          vr10,   \in0,  7
-    vilvl.d          vr2,    vr2,   \in0
-    vilvl.d          vr3,    vr4,   vr3
-    vilvl.d          vr4,    vr6,   vr5
-    vilvl.d          vr5,    vr10,  vr7
-    vdp2.h.bu.b      \in0,   vr2,   vr8
-    vdp2.h.bu.b      vr2,    vr3,   vr8
-    vdp2.h.bu.b      vr3,    vr4,   vr8
-    vdp2.h.bu.b      vr4,    vr5,   vr8
-    vhaddw.d.h       \in0
-    vhaddw.d.h       vr2
-    vhaddw.d.h       vr3
-    vhaddw.d.h       vr4
-    vpickev.w        \in0,   vr2,   \in0
-    vpickev.w        vr2,    vr4,   vr3
-    vpickev.h        \in0,   vr2,   \in0
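+    //gather each pixel's two 4-byte tap windows with vshuf.b; even/odd
+    //widening MACs plus vhaddw.w.h yield one 8-tap sum per pixel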
+    vshuf.b          vr2,    \in0,  \in0,   vr6
+    vshuf.b          vr3,    \in0,  \in0,   vr7
+    vshuf.b          vr4,    \in0,  \in0,   vr8
+    vmulwev.h.bu.b   vr12,   vr2,   vr10
+    vmulwev.h.bu.b   vr13,   vr3,   vr11
+    vmulwev.h.bu.b   vr14,   vr3,   vr10
+    vmulwev.h.bu.b   vr15,   vr4,   vr11
+    vmaddwod.h.bu.b  vr12,   vr2,   vr10
+    vmaddwod.h.bu.b  vr13,   vr3,   vr11
+    vmaddwod.h.bu.b  vr14,   vr3,   vr10
+    vmaddwod.h.bu.b  vr15,   vr4,   vr11
+    vadd.h           vr12,   vr12,  vr13
+    vadd.h           vr14,   vr14,  vr15
+    vhaddw.w.h       vr12,   vr12,  vr12
+    vhaddw.w.h       vr14,   vr14,  vr14
+    vpickev.h        \in0,   vr14,  vr12
     vadd.h           \in0,   \in0,  vr9
 .endm
-.macro FILTER_8TAP_4W in0
-    vbsrl.v          vr10,   \in0,  1
-    vbsrl.v          vr11,   \in0,  2
-    vbsrl.v          vr12,   \in0,  3
-    vilvl.d          vr10,   vr10, \in0
-    vilvl.d          vr11,   vr12,  vr11
-    vdp2.h.bu.b      vr7,    vr10,  vr8
-    vdp2.h.bu.b      vr10,   vr11,  vr8
-    vhaddw.d.h       vr7
-    vhaddw.d.h       vr10
-    vpickev.w        \in0,   vr10,  vr7
-.endm
+
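+//vshuf.b index tables: each 32-bit lane selects a sliding 4-byte source
+//window (shuf0 pairs windows from two rows, shuf1 walks a single row)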
+const subpel_h_shuf0
+.byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20
+endconst
+const subpel_h_shuf1
+.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+endconst
+
 .macro FILTER_8TAP_8W in0
-    vbsrl.v         vr10,    \in0,  1
-    vbsrl.v         vr11,    \in0,  2
-    vbsrl.v         vr12,    \in0,  3
-    vbsrl.v         vr13,    \in0,  4
-    vbsrl.v         vr14,    \in0,  5
-    vbsrl.v         vr15,    \in0,  6
-    vbsrl.v         vr16,    \in0,  7
-    vilvl.d         vr10,    vr10,  \in0
-    vilvl.d         vr11,    vr12,  vr11
-    vilvl.d         vr12,    vr14,  vr13
-    vilvl.d         vr13,    vr16,  vr15
-    vdp2.h.bu.b     vr14,    vr10,  vr8
-    vdp2.h.bu.b     vr15,    vr11,  vr8
-    vdp2.h.bu.b     vr16,    vr12,  vr8
-    vdp2.h.bu.b     vr17,    vr13,  vr8
-    vhaddw.d.h      vr14
-    vhaddw.d.h      vr15
-    vhaddw.d.h      vr16
-    vhaddw.d.h      vr17
-    vpickev.w       vr13,    vr15,  vr14
-    vpickev.w       vr14,    vr17,  vr16
-    vpickev.h       \in0,    vr14,  vr13 //x0 ... x7
-    vsrari.h        \in0,    \in0,  2
-.endm
-.macro FILTER_8TAP_8W_CLIP_STORE
-    vdp2.w.h        vr12,    vr0,   vr9
-    vdp2.w.h        vr13,    vr1,   vr9
-    vdp2.w.h        vr14,    vr2,   vr9
-    vdp2.w.h        vr15,    vr3,   vr9
-    vdp2.w.h        vr16,    vr4,   vr9
-    vdp2.w.h        vr17,    vr5,   vr9
-    vdp2.w.h        vr18,    vr6,   vr9
-    vdp2.w.h        vr19,    vr7,   vr9
-    vhaddw.q.w      vr12
-    vhaddw.q.w      vr13
-    vhaddw.q.w      vr14
-    vhaddw.q.w      vr15
-    vhaddw.q.w      vr16
-    vhaddw.q.w      vr17
-    vhaddw.q.w      vr18
-    vhaddw.q.w      vr19
-    vpackev.w       vr12,    vr13,  vr12
-    vpackev.w       vr13,    vr15,  vr14
-    vpackev.d       vr12,    vr13,  vr12
-    vpackev.w       vr14,    vr17,  vr16
-    vpackev.w       vr15,    vr19,  vr18
-    vpackev.d       vr13,    vr15,  vr14
-    vssrarni.hu.w   vr13,    vr12,  10
-    vssrani.bu.h    vr13,    vr13,  0
-    vstelm.d        vr13,    a0,    0,   0
-    add.d           a0,      a0,    a1
-.endm
-.macro VEXTRINS_Hx8 in0
-    vextrins.h      vr0,     \in0,  0x70
-    vextrins.h      vr1,     \in0,  0x71
-    vextrins.h      vr2,     \in0,  0x72
-    vextrins.h      vr3,     \in0,  0x73
-    vextrins.h      vr4,     \in0,  0x74
-    vextrins.h      vr5,     \in0,  0x75
-    vextrins.h      vr6,     \in0,  0x76
-    vextrins.h      vr7,     \in0,  0x77
-.endm
-.macro VBSRL_Vx8
-    vbsrl.v         vr0,     vr0,   2
-    vbsrl.v         vr1,     vr1,   2
-    vbsrl.v         vr2,     vr2,   2
-    vbsrl.v         vr3,     vr3,   2
-    vbsrl.v         vr4,     vr4,   2
-    vbsrl.v         vr5,     vr5,   2
-    vbsrl.v         vr6,     vr6,   2
-    vbsrl.v         vr7,     vr7,   2
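+    //horizontal 8-tap for 8 pixels: three shuffled window sets, even/odd
+    //widening MACs, then the intermediate rounding (sum + 2) >> 2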
+    vshuf.b         vr13,    \in0,  \in0,  vr7
+    vshuf.b         vr14,    \in0,  \in0,  vr11
+    vshuf.b         vr15,    \in0,  \in0,  vr12
+    vmulwev.h.bu.b  vr16,    vr13,  vr8
+    vmulwev.h.bu.b  vr17,    vr14,  vr10
+    vmulwev.h.bu.b  vr18,    vr14,  vr8
+    vmulwev.h.bu.b  vr19,    vr15,  vr10
+    vmaddwod.h.bu.b vr16,    vr13,  vr8
+    vmaddwod.h.bu.b vr17,    vr14,  vr10
+    vmaddwod.h.bu.b vr18,    vr14,  vr8
+    vmaddwod.h.bu.b vr19,    vr15,  vr10
+    vadd.h          vr16,    vr16,  vr17
+    vadd.h          vr18,    vr18,  vr19
+    vhaddw.w.h      vr16,    vr16,  vr16
+    vhaddw.w.h      \in0,    vr18,  vr18
+    vssrarni.h.w    \in0,    vr16,  2
 .endm
 
 .macro PUT_8TAP_8BPC_LSX lable
@@ -2910,9 +2845,7 @@
     addi.w           t5,     a6,    -1
     slli.w           t5,     t5,    3
     add.w            t1,     t1,    t5
-    add.d            t1,     t6,    t1 //fh's offset
-    vldrepl.d        vr8,    t1,    0
-    addi.d           a2,     a2,    -3
+    add.d            t7,     t6,    t1 //address of fh
     li.w             t1,     34
     vreplgr2vr.h     vr9,    t1
 
@@ -2936,75 +2869,72 @@
     .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable
 
 .l_\lable\()put_h_2w:
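+    //w<=4 filters have zero outer taps: step fh ahead by 2 and load just
+    //the middle 4 taps, so the src offset becomes -1 instead of -3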
+    addi.d           t7,     t7,    2
+    addi.d           a2,     a2,    -1
+    vldrepl.w        vr8,    t7,    0
+    la.local         t7,     subpel_h_shuf0
+    vld              vr7,    t7,    0
+.l_\lable\()put_h_2w_loop:
     vld              vr0,    a2,    0
     vldx             vr1,    a2,    a3
     add.d            a2,     a2,    t2
 
-    vbsrl.v          vr2,    vr0,   1
-    vilvl.d          vr0,    vr2,   vr0
-    vdp2.h.bu.b      vr2,    vr0,   vr8
-    vhaddw.w.h       vr0,    vr2,   vr2
-    vhaddw.d.w       vr0,    vr0,   vr0
-    vbsrl.v          vr2,    vr1,   1
-    vilvl.d          vr1,    vr2,   vr1
-    vdp2.h.bu.b      vr2,    vr1,   vr8
-    vhaddw.w.h       vr1,    vr2,   vr2
-    vhaddw.d.w       vr1,    vr1,   vr1
-    vpickev.w        vr0,    vr1,   vr0
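+    //shuf0 packs both rows' windows into one vector, so a single
+    //vdp2 + vhaddw computes all four 4-tap sums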
+    vshuf.b          vr0,    vr1,   vr0,   vr7
+    vdp2.h.bu.b      vr1,    vr0,   vr8
+    vhaddw.w.h       vr0,    vr1,   vr1
     vpickev.h        vr0,    vr0,   vr0
     vadd.h           vr0,    vr0,   vr9
     vssrani.bu.h     vr0,    vr0,   6
 
-    vstelm.h         vr0,    a0,    0,   0
+    vstelm.h         vr0,    a0,    0,     0
     add.d            a0,     a0,    a1
-    vstelm.h         vr0,    a0,    0,   1
+    vstelm.h         vr0,    a0,    0,     1
     add.d            a0,     a0,    a1
     addi.w           a5,     a5,    -2
-    bnez             a5,     .l_\lable\()put_h_2w
+    bnez             a5,     .l_\lable\()put_h_2w_loop
     b                .l_\lable\()end_put_8tap
 
 .l_\lable\()put_h_4w:
+    addi.d           t7,     t7,    2
+    addi.d           a2,     a2,    -1
+    vldrepl.w        vr8,    t7,    0
+    la.local         t7,     subpel_h_shuf1
+    vld              vr7,    t7,    0
+.l_\lable\()put_h_4w_loop:
     vld              vr0,    a2,    0
     vldx             vr1,    a2,    a3
     add.d            a2,     a2,    t2
 
-    vbsrl.v          vr2,    vr0,   1
-    vbsrl.v          vr3,    vr0,   2
-    vbsrl.v          vr4,    vr0,   3
-    vilvl.d          vr0,    vr2,   vr0 //x0 x1
-    vilvl.d          vr2,    vr4,   vr3 //x2 x3
-    vdp2.h.bu.b      vr3,    vr0,   vr8
-    vdp2.h.bu.b      vr4,    vr2,   vr8
-    vhaddw.w.h       vr0,    vr3,   vr3
-    vhaddw.d.w       vr0,    vr0,   vr0
-    vhaddw.w.h       vr2,    vr4,   vr4
-    vhaddw.d.w       vr2,    vr2,   vr2
-    vpickev.w        vr5,    vr2,   vr0
-    vbsrl.v          vr2,    vr1,   1
-    vbsrl.v          vr3,    vr1,   2
-    vbsrl.v          vr4,    vr1,   3
-    vilvl.d          vr0,    vr2,   vr1 //x0 x1
-    vilvl.d          vr2,    vr4,   vr3 //x2 x3
-    vdp2.h.bu.b      vr3,    vr0,   vr8
-    vdp2.h.bu.b      vr4,    vr2,   vr8
-    vhaddw.w.h       vr0,    vr3,   vr3
-    vhaddw.d.w       vr0,    vr0,   vr0
-    vhaddw.w.h       vr2,    vr4,   vr4
-    vhaddw.d.w       vr2,    vr2,   vr2
-    vpickev.w        vr6,    vr2,   vr0
-    vpickev.h        vr0,    vr6,   vr5
+    vshuf.b          vr0,    vr0,   vr0,   vr7
+    vshuf.b          vr1,    vr1,   vr1,   vr7
+    vmulwev.h.bu.b   vr2,    vr0,   vr8
+    vmulwev.h.bu.b   vr3,    vr1,   vr8
+    vmaddwod.h.bu.b  vr2,    vr0,   vr8
+    vmaddwod.h.bu.b  vr3,    vr1,   vr8
+    vhaddw.w.h       vr0,    vr2,   vr2
+    vhaddw.w.h       vr1,    vr3,   vr3
+    vpickev.h        vr0,    vr1,   vr0
     vadd.h           vr0,    vr0,   vr9
     vssrani.bu.h     vr0,    vr0,   6
 
-    vstelm.w         vr0,    a0,    0,    0
+    vstelm.w         vr0,    a0,    0,     0
     add.d            a0,     a0,    a1
-    vstelm.w         vr0,    a0,    0,    1
+    vstelm.w         vr0,    a0,    0,     1
     add.d            a0,     a0,    a1
     addi.d           a5,     a5,    -2
-    bnez             a5,     .l_\lable\()put_h_4w
+    bnez             a5,     .l_\lable\()put_h_4w_loop
     b                .l_\lable\()end_put_8tap
 
 .l_\lable\()put_h_8w:
+    fld.d            f10,    t7,    0
+    vreplvei.w       vr11,   vr10,  1
+    vreplvei.w       vr10,   vr10,  0
+    la.local         t7,     subpel_h_shuf1
+    vld              vr6,    t7,    0
+    vaddi.bu         vr7,    vr6,   4
+    vaddi.bu         vr8,    vr6,   8
+    addi.d           a2,     a2,    -3
+.l_\lable\()put_h_8w_loop:
     vld              vr0,    a2,    0
     vldx             vr1,    a2,    a3
     add.d            a2,     a2,    t2
@@ -3016,35 +2946,41 @@
     vstelm.d         vr1,    a0,    0,    1
     add.d            a0,     a0,    a1
     addi.w           a5,     a5,    -2
-    bnez             a5,     .l_\lable\()put_h_8w
+    bnez             a5,     .l_\lable\()put_h_8w_loop
     b                .l_\lable\()end_put_8tap
 
 .l_\lable\()put_h_16w:
 .l_\lable\()put_h_32w:
 .l_\lable\()put_h_64w:
 .l_\lable\()put_h_128w:
+    fld.d            f10,    t7,    0
+    vreplvei.w       vr11,   vr10,  1
+    vreplvei.w       vr10,   vr10,  0
+    la.local         t7,     subpel_h_shuf1
+    vld              vr6,    t7,    0
+    vaddi.bu         vr7,    vr6,   4
+    vaddi.bu         vr8,    vr6,   8
+    addi.d           a2,     a2,    -3
     addi.d           t0,     a2,    0 //src
     addi.w           t5,     a5,    0 //h
     addi.d           t8,     a0,    0 //dst
 .l_\lable\()put_h_16w_loop:
     vld              vr0,    a2,    0
-    vldx             vr1,    a2,    a3
-    add.d            a2,     a2,    t2
+    vld              vr1,    a2,    8
+    add.d            a2,     a2,    a3
     PUT_H_8W         vr0
     PUT_H_8W         vr1
     vssrani.bu.h     vr1,    vr0,   6
-    vstelm.d         vr1,    a0,    0,   0
+    vst              vr1,    a0,    0
     add.d            a0,     a0,    a1
-    vstelm.d         vr1,    a0,    0,   1
-    add.d            a0,     a0,    a1
-    addi.d           a5,     a5,    -2
+    addi.d           a5,     a5,    -1
     bnez             a5,     .l_\lable\()put_h_16w_loop
-    addi.d           a2,     t0,    8
-    addi.d           t0,     t0,    8
-    addi.d           a0,     t8,    8
-    addi.d           t8,     t8,    8
+    addi.d           a2,     t0,    16
+    addi.d           t0,     t0,    16
+    addi.d           a0,     t8,    16
+    addi.d           t8,     t8,    16
     addi.w           a5,     t5,    0
-    addi.w           a4,     a4,    -8
+    addi.w           a4,     a4,    -16
     bnez             a4,     .l_\lable\()put_h_16w_loop
     b                .l_\lable\()end_put_8tap
 
@@ -3065,6 +3001,12 @@
     vldrepl.d        vr8,    t1,    0
     sub.d            a2,     a2,    t3
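+    //expand fv into four replicated tap pairs: vr8={f0,f1}, vr9={f2,f3},
+    //vr10={f4,f5}, vr11={f6,f7}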
 
+    vilvl.h          vr8,    vr8,   vr8
+    vreplvei.w       vr9,    vr8,   1
+    vreplvei.w       vr10,   vr8,   2
+    vreplvei.w       vr11,   vr8,   3
+    vreplvei.w       vr8,    vr8,   0
+
     clz.w            t1,     a4
     li.w             t5,     24
     sub.w            t1,     t1,    t5
@@ -3094,36 +3036,43 @@
     fldx.s           f5,     a2,    t2
     fldx.s           f6,     a2,    t3
     add.d            a2,     a2,    t4
-    vilvl.b          vr0,    vr1,   vr0
-    vilvl.b          vr1,    vr3,   vr2
-    vilvl.b          vr2,    vr5,   vr4
-    vilvl.b          vr3,    vr7,   vr6
-    vilvl.h          vr0,    vr1,   vr0
-    vilvl.h          vr1,    vr3,   vr2
-    vilvl.w          vr0,    vr1,   vr0
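+    //build the sliding window: each byte pair holds one pixel from two
+    //consecutive rows, lining up with the replicated fv tap pairs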
 
+    vilvl.h          vr0,    vr1,   vr0 //0 1
+    vilvl.h          vr1,    vr2,   vr1 //1 2
+    vilvl.b          vr0,    vr1,   vr0 //01 12
+    vilvl.h          vr2,    vr3,   vr2 //2 3
+    vilvl.h          vr3,    vr4,   vr3 //3 4
+    vilvl.b          vr1,    vr3,   vr2 //23 34
+    vilvl.h          vr2,    vr5,   vr4 //4 5
+    vilvl.h          vr3,    vr6,   vr5 //5 6
+    vilvl.b          vr2,    vr3,   vr2 //45 56
 .l_\lable\()put_v_2w_loop:
-    fld.s            f7,     a2,    0  //h0
-    fldx.s           f10,    a2,    a3 //h1
+    fld.s            f7,     a2,    0
+    vilvl.h          vr3,    vr7,   vr6 //6 7
+    fldx.s           f6,     a2,    a3
     add.d            a2,     a2,    t2
+    vilvl.h          vr4,    vr6,   vr7 //7 8
+    vilvl.b          vr3,    vr4,   vr3 //67 78
 
-    vextrins.b       vr0,    vr7,   0x70
-    vextrins.b       vr0,    vr7,   0xf1
-    vbsrl.v          vr1,    vr0,   1
-    vextrins.b       vr1,    vr10,  0x70
-    vextrins.b       vr1,    vr10,  0xf1
-    vdp2.h.bu.b      vr10,   vr0,   vr8
-    vdp2.h.bu.b      vr11,   vr1,   vr8
-    vbsrl.v          vr0,    vr1,   1
-    vhaddw.d.h       vr10
-    vhaddw.d.h       vr11
-    vpickev.w        vr10,   vr11,  vr10
-    vssrarni.hu.w    vr10,   vr10,  6
-    vssrani.bu.h     vr10,   vr10,  0
+    vmulwev.h.bu.b   vr12,   vr0,   vr8
+    vmulwev.h.bu.b   vr13,   vr1,   vr9
+    vmulwev.h.bu.b   vr14,   vr2,   vr10
+    vmulwev.h.bu.b   vr15,   vr3,   vr11
+    vmaddwod.h.bu.b  vr12,   vr0,   vr8
+    vmaddwod.h.bu.b  vr13,   vr1,   vr9
+    vmaddwod.h.bu.b  vr14,   vr2,   vr10
+    vmaddwod.h.bu.b  vr15,   vr3,   vr11
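+    //slide the row-pair window down (vaddi.hu vd, vj, 0 is a vector move)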
+    vaddi.hu         vr0,    vr1,   0
+    vaddi.hu         vr1,    vr2,   0
+    vaddi.hu         vr2,    vr3,   0
+    vadd.h           vr12,   vr12,  vr13
+    vadd.h           vr12,   vr12,  vr14
+    vadd.h           vr12,   vr12,  vr15
 
-    vstelm.h         vr10,   a0,    0,   0
+    vssrarni.bu.h    vr12,   vr12,  6
+    vstelm.h         vr12,   a0,    0,   0
     add.d            a0,     a0,    a1
-    vstelm.h         vr10,   a0,    0,   1
+    vstelm.h         vr12,   a0,    0,   1
     add.d            a0,     a0,    a1
     addi.w           a5,     a5,    -2
     bnez             a5,     .l_\lable\()put_v_2w_loop
@@ -3140,50 +3089,43 @@
     fldx.s           f6,     a2,    t3
     add.d            a2,     a2,    t4
 
+    vilvl.w          vr0,    vr1,   vr0
+    vilvl.w          vr1,    vr2,   vr1
     vilvl.b          vr0,    vr1,   vr0
-    vilvl.b          vr1,    vr3,   vr2
-    vilvl.b          vr2,    vr5,   vr4
-    vilvl.b          vr3,    vr7,   vr6
-    vilvl.h          vr0,    vr1,   vr0
-    vilvl.h          vr1,    vr3,   vr2
-    vilvl.w          vr2,    vr1,   vr0
-    vilvh.w          vr3,    vr1,   vr0
-
+    vilvl.w          vr1,    vr3,   vr2
+    vilvl.w          vr2,    vr4,   vr3
+    vilvl.b          vr1,    vr2,   vr1
+    vilvl.w          vr2,    vr5,   vr4
+    vilvl.w          vr3,    vr6,   vr5
+    vilvl.b          vr2,    vr3,   vr2
 .l_\lable\()put_v_4w_loop:
     fld.s            f7,     a2,    0
-    fldx.s           f10,    a2,    a3
+
+    vilvl.w          vr3,    vr7,   vr6
+    fldx.s           f6,     a2,    a3
     add.d            a2,     a2,    t2
+    vilvl.w          vr4,    vr6,   vr7
+    vilvl.b          vr3,    vr4,   vr3
 
-    vextrins.b       vr2,    vr7,   0x70
-    vextrins.b       vr2,    vr7,   0xf1 //x0x1(h0)
-    vbsrl.v          vr4,    vr2,   1
-    vextrins.b       vr4,    vr10,  0x70
-    vextrins.b       vr4,    vr10,  0xf1 //x0x1(h1)
-    vdp2.h.bu.b      vr11,   vr2,   vr8
-    vdp2.h.bu.b      vr12,   vr4,   vr8
-    vbsrl.v          vr2,    vr4,   1
+    vmulwev.h.bu.b   vr12,   vr0,   vr8
+    vmulwev.h.bu.b   vr13,   vr1,   vr9
+    vmulwev.h.bu.b   vr14,   vr2,   vr10
+    vmulwev.h.bu.b   vr15,   vr3,   vr11
+    vmaddwod.h.bu.b  vr12,   vr0,   vr8
+    vmaddwod.h.bu.b  vr13,   vr1,   vr9
+    vmaddwod.h.bu.b  vr14,   vr2,   vr10
+    vmaddwod.h.bu.b  vr15,   vr3,   vr11
+    vaddi.hu         vr0,    vr1,   0
+    vaddi.hu         vr1,    vr2,   0
+    vaddi.hu         vr2,    vr3,   0
+    vadd.h           vr12,   vr12,  vr13
+    vadd.h           vr12,   vr12,  vr14
+    vadd.h           vr12,   vr12,  vr15
 
-    vextrins.b       vr3,    vr7,   0x72
-    vextrins.b       vr3,    vr7,   0xf3 //x2x3(h0)
-    vbsrl.v          vr4,    vr3,   1
-    vextrins.b       vr4,    vr10,  0x72
-    vextrins.b       vr4,    vr10,  0xf3 //x2x3(h1)
-    vdp2.h.bu.b      vr13,   vr3,   vr8
-    vdp2.h.bu.b      vr14,   vr4,   vr8
-    vbsrl.v          vr3,    vr4,   1
-
-    vhaddw.d.h       vr11
-    vhaddw.d.h       vr12
-    vhaddw.d.h       vr13
-    vhaddw.d.h       vr14
-
-    vpickev.w        vr11,   vr13,  vr11
-    vpickev.w        vr12,   vr14,  vr12
-    vpickev.h        vr11,   vr12,  vr11
-    vssrarni.bu.h    vr11,   vr11,  6
-    vstelm.w         vr11,   a0,    0,   0
+    vssrarni.bu.h    vr12,   vr12,  6
+    vstelm.w         vr12,   a0,    0,   0
     add.d            a0,     a0,    a1
-    vstelm.w         vr11,   a0,    0,   1
+    vstelm.w         vr12,   a0,    0,   1
     add.d            a0,     a0,    a1
     addi.w           a5,     a5,    -2
     bnez             a5,     .l_\lable\()put_v_4w_loop
@@ -3208,76 +3150,54 @@
     fldx.d           f6,     a2,    t3
     add.d            a2,     a2,    t4
 
-    vilvl.b          vr0,    vr1,   vr0
-    vilvl.b          vr1,    vr3,   vr2
-    vilvl.b          vr2,    vr5,   vr4
-    vilvl.b          vr3,    vr7,   vr6
-    vilvl.h          vr4,    vr1,   vr0
-    vilvh.h          vr5,    vr1,   vr0
-    vilvl.h          vr6,    vr3,   vr2
-    vilvh.h          vr7,    vr3,   vr2
-    vilvl.w          vr0,    vr6,   vr4 // x0x1
-    vilvh.w          vr1,    vr6,   vr4 // x2x3
-    vilvl.w          vr2,    vr7,   vr5 // x4x5
-    vilvh.w          vr3,    vr7,   vr5 // x6x7
+    vilvl.b          vr0,    vr1,   vr0 //0 1
+    vilvl.b          vr1,    vr2,   vr1 //1 2
+    vilvl.b          vr2,    vr3,   vr2 //2 3
+    vilvl.b          vr3,    vr4,   vr3 //3 4
+    vilvl.b          vr4,    vr5,   vr4 //4 5
+    vilvl.b          vr5,    vr6,   vr5 //5 6
 .l_\lable\()put_v_8w_loop:
     fld.d            f7,     a2,    0
-    fldx.d           f10,    a2,    a3
+    vilvl.b          vr12,   vr7,   vr6 //6 7
+    fldx.d           f6,     a2,    a3
     add.d            a2,     a2,    t2
-    //h0
-    vextrins.b       vr0,    vr7,   0x70
-    vextrins.b       vr0,    vr7,   0xf1
-    vextrins.b       vr1,    vr7,   0x72
-    vextrins.b       vr1,    vr7,   0xf3
-    vextrins.b       vr2,    vr7,   0x74
-    vextrins.b       vr2,    vr7,   0xf5
-    vextrins.b       vr3,    vr7,   0x76
-    vextrins.b       vr3,    vr7,   0xf7
-    vdp2.h.bu.b      vr11,   vr0,   vr8
-    vdp2.h.bu.b      vr12,   vr1,   vr8
-    vdp2.h.bu.b      vr13,   vr2,   vr8
-    vdp2.h.bu.b      vr14,   vr3,   vr8
-    vhaddw.d.h       vr11
-    vhaddw.d.h       vr12
-    vhaddw.d.h       vr13
-    vhaddw.d.h       vr14
-    vpickev.w        vr11,   vr12,  vr11
-    vpickev.w        vr12,   vr14,  vr13
-    vpickev.h        vr11,   vr12,  vr11
-    vssrarni.bu.h    vr11,   vr11,  6
-    fst.d            f11,    a0,    0
+    vilvl.b          vr13,   vr6,   vr7 //7 8
+
+    vmulwev.h.bu.b   vr14,   vr0,   vr8
+    vmulwev.h.bu.b   vr15,   vr1,   vr8
+    vmulwev.h.bu.b   vr16,   vr2,   vr9
+    vmulwev.h.bu.b   vr17,   vr3,   vr9
+    vmulwev.h.bu.b   vr18,   vr4,   vr10
+    vmulwev.h.bu.b   vr19,   vr5,   vr10
+    vmulwev.h.bu.b   vr20,   vr12,  vr11
+    vmulwev.h.bu.b   vr21,   vr13,  vr11
+    vmaddwod.h.bu.b  vr14,   vr0,   vr8
+    vmaddwod.h.bu.b  vr15,   vr1,   vr8
+    vmaddwod.h.bu.b  vr16,   vr2,   vr9
+    vmaddwod.h.bu.b  vr17,   vr3,   vr9
+    vmaddwod.h.bu.b  vr18,   vr4,   vr10
+    vmaddwod.h.bu.b  vr19,   vr5,   vr10
+    vmaddwod.h.bu.b  vr20,   vr12,  vr11
+    vmaddwod.h.bu.b  vr21,   vr13,  vr11
+
+    vaddi.hu         vr0,    vr2,   0
+    vaddi.hu         vr1,    vr3,   0
+    vaddi.hu         vr2,    vr4,   0
+    vaddi.hu         vr3,    vr5,   0
+    vaddi.hu         vr4,    vr12,  0
+    vaddi.hu         vr5,    vr13,  0
+    vadd.h           vr14,   vr14,  vr16
+    vadd.h           vr14,   vr14,  vr18
+    vadd.h           vr14,   vr14,  vr20
+    vadd.h           vr15,   vr15,  vr17
+    vadd.h           vr15,   vr15,  vr19
+    vadd.h           vr15,   vr15,  vr21
+
+    vssrarni.bu.h    vr15,   vr14,  6
+    vstelm.d         vr15,   a0,    0,   0
     add.d            a0,     a0,    a1
-    //h1
-    vbsrl.v          vr0,    vr0,   1
-    vbsrl.v          vr1,    vr1,   1
-    vbsrl.v          vr2,    vr2,   1
-    vbsrl.v          vr3,    vr3,   1
-    vextrins.b       vr0,    vr10,  0x70
-    vextrins.b       vr0,    vr10,  0xf1
-    vextrins.b       vr1,    vr10,  0x72
-    vextrins.b       vr1,    vr10,  0xf3
-    vextrins.b       vr2,    vr10,  0x74
-    vextrins.b       vr2,    vr10,  0xf5
-    vextrins.b       vr3,    vr10,  0x76
-    vextrins.b       vr3,    vr10,  0xf7
-    vdp2.h.bu.b      vr11,   vr0,   vr8
-    vdp2.h.bu.b      vr12,   vr1,   vr8
-    vdp2.h.bu.b      vr13,   vr2,   vr8
-    vdp2.h.bu.b      vr14,   vr3,   vr8
-    vhaddw.d.h       vr11
-    vhaddw.d.h       vr12
-    vhaddw.d.h       vr13
-    vhaddw.d.h       vr14
-    vpickev.w        vr11,   vr12,  vr11
-    vpickev.w        vr12,   vr14,  vr13
-    vpickev.h        vr11,   vr12,  vr11
-    vssrarni.bu.h    vr11,   vr11,  6
-    fst.d            f11,    a0,    0
+    vstelm.d         vr15,   a0,    0,   1
     add.d            a0,     a0,    a1
-    vbsrl.v          vr0,    vr0,   1
-    vbsrl.v          vr1,    vr1,   1
-    vbsrl.v          vr2,    vr2,   1
-    vbsrl.v          vr3,    vr3,   1
     addi.w           a5,     a5,    -2
     bnez             a5,     .l_\lable\()put_v_8w_loop
     addi.d           a2,     t0,    8
@@ -3341,6 +3261,7 @@
     .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable
 
 .l_\lable\()put_hv_2w:
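+    //skip the two leading zero taps of the w<=4 horizontal filter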
+    addi.d           a2,     a2,    2
     vld              vr0,    a2,    0
     vldx             vr1,    a2,    a3
     vldx             vr2,    a2,    t2
@@ -3351,86 +3272,71 @@
     vldx             vr6,    a2,    t3
     add.d            a2,     a2,    t4
 
-    vbsrl.v          vr10,   vr0,   1
-    vbsrl.v          vr11,   vr1,   1
-    vbsrl.v          vr12,   vr2,   1
-    vbsrl.v          vr13,   vr3,   1
-    vbsrl.v          vr14,   vr4,   1
-    vbsrl.v          vr15,   vr5,   1
-    vbsrl.v          vr16,   vr6,   1
-    vilvl.d          vr0,    vr10,  vr0
-    vilvl.d          vr1,    vr11,  vr1
-    vilvl.d          vr2,    vr12,  vr2
-    vilvl.d          vr3,    vr13,  vr3
-    vilvl.d          vr4,    vr14,  vr4
-    vilvl.d          vr5,    vr15,  vr5
-    vilvl.d          vr6,    vr16,  vr6
-    vdp2.h.bu.b      vr10,   vr0,   vr8
-    vdp2.h.bu.b      vr11,   vr1,   vr8
-    vdp2.h.bu.b      vr12,   vr2,   vr8
-    vdp2.h.bu.b      vr13,   vr3,   vr8
-    vdp2.h.bu.b      vr14,   vr4,   vr8
-    vdp2.h.bu.b      vr15,   vr5,   vr8
-    vdp2.h.bu.b      vr16,   vr6,   vr8
-    vhaddw.d.h       vr10
-    vhaddw.d.h       vr11
-    vhaddw.d.h       vr12
-    vhaddw.d.h       vr13
-    vhaddw.d.h       vr14
-    vhaddw.d.h       vr15
-    vhaddw.d.h       vr16
+    la.local         t1,     subpel_h_shuf0
+    vld              vr7,    t1,    0
+    vbsrl.v          vr8,    vr8,   2
+    vreplvei.w       vr8,    vr8,   0
 
-    vpackev.w        vr10,   vr11,  vr10
-    vpackev.w        vr12,   vr13,  vr12
-    vpackod.d        vr11,   vr12,  vr10
-    vpackev.d        vr10,   vr12,  vr10
+    //fv
+    vreplvei.w       vr14,   vr9,   1
+    vreplvei.w       vr15,   vr9,   2
+    vreplvei.w       vr16,   vr9,   3
+    vreplvei.w       vr9,    vr9,   0
 
-    vpackev.w        vr12,   vr15,  vr14
-    vpackev.w        vr16,   vr17,  vr16
-    vpackod.d        vr13,   vr16,  vr12
-    vpackev.d        vr12,   vr16,  vr12
-
-    vpickev.h        vr10,   vr12,  vr10 //0 1 2  3  4  5  6  * (h0)
-    vpickev.h        vr11,   vr13,  vr11 //8 9 10 11 12 13 14 * (h1)
-    vsrari.h         vr10,   vr10,  2
-    vsrari.h         vr11,   vr11,  2
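+    //horizontal pass over the 7 setup rows, rounding intermediates to
+    //(sum + 2) >> 2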
+    vshuf.b          vr0,    vr1,   vr0,  vr7
+    vshuf.b          vr1,    vr3,   vr2,  vr7
+    vshuf.b          vr2,    vr5,   vr4,  vr7
+    vshuf.b          vr3,    vr6,   vr6,  vr7
+    vmulwev.h.bu.b   vr10,   vr0,   vr8
+    vmulwev.h.bu.b   vr11,   vr1,   vr8
+    vmulwev.h.bu.b   vr12,   vr2,   vr8
+    vmulwev.h.bu.b   vr13,   vr3,   vr8
+    vmaddwod.h.bu.b  vr10,   vr0,   vr8
+    vmaddwod.h.bu.b  vr11,   vr1,   vr8
+    vmaddwod.h.bu.b  vr12,   vr2,   vr8
+    vmaddwod.h.bu.b  vr13,   vr3,   vr8
+    vhaddw.w.h       vr0,    vr10,  vr10
+    vhaddw.w.h       vr1,    vr11,  vr11
+    vssrarni.h.w     vr1,    vr0,   2 //h0 h1 h2 h3
+    vhaddw.w.h       vr2,    vr12,  vr12
+    vhaddw.w.h       vr3,    vr13,  vr13
+    vssrarni.h.w     vr3,    vr2,   2 //h4 h5 h6 ~
+    vbsrl.v          vr2,    vr1,   4
+    vextrins.w       vr2,    vr3,   0x30 //h1 h2 h3 h4
+    vilvl.h          vr4,    vr2,   vr1 //h0 h1 h1 h2 --
+    vilvh.h          vr5,    vr2,   vr1 //h2 h3 h3 h4 --
+    vbsrl.v          vr6,    vr3,   4
+    vilvl.h          vr6,    vr6,   vr3 //h4 h5 h5 h6 --
+    vbsrl.v          vr3,    vr3,   8  //h6 ~
 .l_\lable\()put_hv_2w_loop:
-    vld              vr7,    a2,    0
-    vldx             vr12,   a2,    a3
+    vld              vr0,    a2,    0
+    vldx             vr2,    a2,    a3
     add.d            a2,     a2,    t2
+    vshuf.b          vr0,    vr2,   vr0,  vr7
+    vdp2.h.bu.b      vr17,   vr0,   vr8
+    vhaddw.w.h       vr17,   vr17,  vr17
+    vssrarni.h.w     vr17,   vr17,  2 //h7 h8
+    vextrins.w       vr3,    vr17,  0x10 //h6 h7
+    vilvl.h          vr3,    vr17,  vr3  //h6 h7 h7 h8 --
 
-    vbsrl.v          vr1,    vr7,   1
-    vbsrl.v          vr2,    vr12,  1
-    vilvl.d          vr0,    vr1,   vr7
-    vilvl.d          vr1,    vr2,   vr12
-    vdp2.h.bu.b      vr2,    vr0,   vr8
-    vdp2.h.bu.b      vr3,    vr1,   vr8
-    vhaddw.d.h       vr2
-    vhaddw.d.h       vr3
-    vpickev.w        vr2,    vr3,   vr2
-    vpickev.h        vr2,    vr2,   vr2
-    vsrari.h         vr2,    vr2,   2
-    vextrins.h       vr10,   vr2,   0x70 //0 1 2 3 4 5 6 7
-    vextrins.h       vr11,   vr2,   0x71
-    vbsrl.v          vr12,   vr10,  2
-    vbsrl.v          vr13,   vr11,  2
-    vextrins.h       vr12,   vr2,   0x72 //1 2 3 4 5 6 7 8
-    vextrins.h       vr13,   vr2,   0x73
-    vdp2.w.h         vr0,    vr10,  vr9
-    vdp2.w.h         vr1,    vr11,  vr9
-    vdp2.w.h         vr2,    vr12,  vr9
-    vdp2.w.h         vr3,    vr13,  vr9
-    vhaddw.q.w       vr0
-    vhaddw.q.w       vr1
-    vhaddw.q.w       vr2
-    vhaddw.q.w       vr3
-    vpackev.w        vr0,    vr1,   vr0
-    vpackev.w        vr1,    vr3,   vr2
-    vpackev.d        vr0,    vr1,   vr0
-    vssrarni.hu.w    vr0,    vr0,   10
+    vmulwev.w.h      vr18,   vr4,   vr9
+    vmulwev.w.h      vr19,   vr5,   vr14
+    vmulwev.w.h      vr20,   vr6,   vr15
+    vmulwev.w.h      vr21,   vr3,   vr16
+    vmaddwod.w.h     vr18,   vr4,   vr9
+    vmaddwod.w.h     vr19,   vr5,   vr14
+    vmaddwod.w.h     vr20,   vr6,   vr15
+    vmaddwod.w.h     vr21,   vr3,   vr16
+    vaddi.hu         vr4,    vr5,   0
+    vaddi.hu         vr5,    vr6,   0
+    vaddi.hu         vr6,    vr3,   0
+    vbsrl.v          vr3,    vr17,  4 //h8 ~
+    vadd.w           vr18,   vr18,  vr19
+    vadd.w           vr18,   vr18,  vr20
+    vadd.w           vr18,   vr18,  vr21
+
+    vssrarni.hu.w    vr0,    vr18,  10
     vssrani.bu.h     vr0,    vr0,   0
-    vbsrl.v          vr10,   vr12,  2
-    vbsrl.v          vr11,   vr13,  2
     vstelm.h         vr0,    a0,    0,   0
     add.d            a0,     a0,    a1
     vstelm.h         vr0,    a0,    0,   1
@@ -3440,6 +3346,7 @@
     b                .l_\lable\()end_put_8tap
 
 .l_\lable\()put_hv_4w:
+    addi.d           a2,     a2,    2 //skip the two leading zero taps
     vld              vr0,    a2,    0
     vldx             vr1,    a2,    a3
     vldx             vr2,    a2,    t2
@@ -3449,81 +3356,125 @@
     vldx             vr5,    a2,    t2
     vldx             vr6,    a2,    t3
     add.d            a2,     a2,    t4
-    FILTER_8TAP_4W   vr0 //x0 x1 x2 x3
-    FILTER_8TAP_4W   vr1
-    FILTER_8TAP_4W   vr2
-    FILTER_8TAP_4W   vr3
-    FILTER_8TAP_4W   vr4
-    FILTER_8TAP_4W   vr5
-    FILTER_8TAP_4W   vr6
-    vpackev.h        vr0,    vr1,   vr0
-    vpackev.h        vr1,    vr3,   vr2
-    vpackev.h        vr2,    vr5,   vr4
-    vpackev.h        vr3,    vr7,   vr6
-    vilvl.w          vr4,    vr1,   vr0
-    vilvh.w          vr5,    vr1,   vr0
-    vilvl.w          vr6,    vr3,   vr2
-    vilvh.w          vr7,    vr3,   vr2
-    vilvl.d          vr0,    vr6,   vr4 //0 1 2 3 4 5 6 *
-    vilvh.d          vr1,    vr6,   vr4
-    vilvl.d          vr2,    vr7,   vr5
-    vilvh.d          vr3,    vr7,   vr5
-    vsrari.h         vr0,    vr0,   2
-    vsrari.h         vr1,    vr1,   2
-    vsrari.h         vr2,    vr2,   2
-    vsrari.h         vr3,    vr3,   2
+
+    la.local         t1,     subpel_h_shuf1
+    vld              vr7,    t1,    0
+    vbsrl.v          vr8,    vr8,   2
+    vreplvei.w       vr8,    vr8,   0
+
+    //fv
+    vreplvei.w       vr17,   vr9,   0
+    vreplvei.w       vr18,   vr9,   1
+    vreplvei.w       vr19,   vr9,   2
+    vreplvei.w       vr20,   vr9,   3
+
+    //DAV1D_FILTER_8TAP_RND
+    vshuf.b          vr0,    vr0,   vr0,  vr7
+    vshuf.b          vr1,    vr1,   vr1,  vr7
+    vshuf.b          vr2,    vr2,   vr2,  vr7
+    vshuf.b          vr3,    vr3,   vr3,  vr7
+    vshuf.b          vr4,    vr4,   vr4,  vr7
+    vshuf.b          vr5,    vr5,   vr5,  vr7
+    vshuf.b          vr6,    vr6,   vr6,  vr7
+
+    vmulwev.h.bu.b   vr10,   vr0,   vr8
+    vmulwev.h.bu.b   vr11,   vr1,   vr8
+    vmulwev.h.bu.b   vr12,   vr2,   vr8
+    vmulwev.h.bu.b   vr13,   vr3,   vr8
+    vmulwev.h.bu.b   vr14,   vr4,   vr8
+    vmulwev.h.bu.b   vr15,   vr5,   vr8
+    vmulwev.h.bu.b   vr16,   vr6,   vr8
+    vmaddwod.h.bu.b  vr10,   vr0,   vr8
+    vmaddwod.h.bu.b  vr11,   vr1,   vr8
+    vmaddwod.h.bu.b  vr12,   vr2,   vr8
+    vmaddwod.h.bu.b  vr13,   vr3,   vr8
+    vmaddwod.h.bu.b  vr14,   vr4,   vr8
+    vmaddwod.h.bu.b  vr15,   vr5,   vr8
+    vmaddwod.h.bu.b  vr16,   vr6,   vr8
+
+    vhaddw.w.h       vr10,   vr10,  vr10
+    vhaddw.w.h       vr11,   vr11,  vr11
+    vhaddw.w.h       vr12,   vr12,  vr12
+    vhaddw.w.h       vr13,   vr13,  vr13
+    vhaddw.w.h       vr14,   vr14,  vr14
+    vhaddw.w.h       vr15,   vr15,  vr15
+    vhaddw.w.h       vr16,   vr16,  vr16
+
+    vssrarni.h.w     vr10,   vr10,  2 //h0
+    vssrarni.h.w     vr11,   vr11,  2 //h1
+    vssrarni.h.w     vr12,   vr12,  2 //h2
+    vssrarni.h.w     vr13,   vr13,  2 //h3
+    vssrarni.h.w     vr14,   vr14,  2 //h4
+    vssrarni.h.w     vr15,   vr15,  2 //h5
+    vssrarni.h.w     vr16,   vr16,  2 //h6
+
+    //h0
+    vilvl.h          vr0,    vr11,  vr10 //01
+    vilvl.h          vr1,    vr13,  vr12 //23
+    vilvl.h          vr2,    vr15,  vr14 //45
+    //h1
+    vilvl.h          vr4,    vr12,  vr11 //12
+    vilvl.h          vr5,    vr14,  vr13 //34
+    vilvl.h          vr6,    vr16,  vr15 //56
+
 .l_\lable\()put_hv_4w_loop:
-    vld              vr4,    a2,    0
-    vldx             vr5,    a2,    a3
+    vld              vr9,    a2,    0
+    vldx             vr10,   a2,    a3
     add.d            a2,     a2,    t2
-    FILTER_8TAP_4W   vr4
-    FILTER_8TAP_4W   vr5
-    vpickev.h        vr4,    vr5,   vr4
-    vsrari.h         vr4,    vr4,   2
-    vextrins.h       vr0,    vr4,   0x70
-    vextrins.h       vr1,    vr4,   0x71
-    vextrins.h       vr2,    vr4,   0x72
-    vextrins.h       vr3,    vr4,   0x73
-    vbsrl.v          vr5,    vr0,   2
-    vbsrl.v          vr6,    vr1,   2
-    vbsrl.v          vr7,    vr2,   2
-    vbsrl.v          vr10,   vr3,   2
-    vextrins.h       vr5,    vr4,   0x74
-    vextrins.h       vr6,    vr4,   0x75
-    vextrins.h       vr7,    vr4,   0x76
-    vextrins.h       vr10,   vr4,   0x77
-    vdp2.w.h         vr11,   vr0,   vr9
-    vdp2.w.h         vr12,   vr1,   vr9
-    vdp2.w.h         vr13,   vr2,   vr9
-    vdp2.w.h         vr14,   vr3,   vr9
-    vhaddw.q.w       vr11
-    vhaddw.q.w       vr12
-    vhaddw.q.w       vr13
-    vhaddw.q.w       vr14
-    vpackev.w        vr0,    vr12,  vr11
-    vpackev.w        vr1,    vr14,  vr13
-    vpackev.d        vr0,    vr1,   vr0
-    vdp2.w.h         vr11,   vr5,   vr9
-    vdp2.w.h         vr12,   vr6,   vr9
-    vdp2.w.h         vr13,   vr7,   vr9
-    vdp2.w.h         vr14,   vr10,  vr9
-    vhaddw.q.w       vr11
-    vhaddw.q.w       vr12
-    vhaddw.q.w       vr13
-    vhaddw.q.w       vr14
-    vpackev.w        vr1,    vr12,  vr11
-    vpackev.w        vr2,    vr14,  vr13
-    vpackev.d        vr1,    vr2,   vr1
-    vssrarni.hu.w    vr1,    vr0,   10
-    vssrani.bu.h     vr1,    vr1,   0
-    vstelm.w         vr1,    a0,    0,    0
+
+    //DAV1D_FILTER_8TAP_CLIP
+    vshuf.b          vr9,    vr9,   vr9,  vr7
+    vshuf.b          vr10,   vr10,  vr10, vr7
+    vmulwev.h.bu.b   vr11,   vr9,   vr8
+    vmulwev.h.bu.b   vr12,   vr10,  vr8
+    vmaddwod.h.bu.b  vr11,   vr9,   vr8
+    vmaddwod.h.bu.b  vr12,   vr10,  vr8
+    vhaddw.w.h       vr11,   vr11,  vr11
+    vhaddw.w.h       vr12,   vr12,  vr12
+    vssrarni.h.w     vr11,   vr11,  2 //h7
+    vssrarni.h.w     vr12,   vr12,  2 //h8
+    vilvl.h          vr3,    vr11,  vr16 //67
+    vilvl.h          vr13,   vr12,  vr11 //78
+
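+    //vertical 8-tap as pair-wise widening MACs:
+    //sum += fv[2k]*mid[2k] + fv[2k+1]*mid[2k+1] per 32-bit lane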
+    vmulwev.w.h      vr9,    vr0,   vr17
+    vmulwev.w.h      vr10,   vr1,   vr18
+    vmulwev.w.h      vr14,   vr2,   vr19
+    vmulwev.w.h      vr15,   vr3,   vr20
+    vmaddwod.w.h     vr9,    vr0,   vr17
+    vmaddwod.w.h     vr10,   vr1,   vr18
+    vmaddwod.w.h     vr14,   vr2,   vr19
+    vmaddwod.w.h     vr15,   vr3,   vr20
+    vadd.w           vr16,   vr9,   vr10
+    vadd.w           vr16,   vr16,  vr14
+    vadd.w           vr16,   vr16,  vr15
+
+    vmulwev.w.h      vr9,    vr4,   vr17
+    vmulwev.w.h      vr10,   vr5,   vr18
+    vmulwev.w.h      vr14,   vr6,   vr19
+    vmulwev.w.h      vr15,   vr13,  vr20
+    vmaddwod.w.h     vr9,    vr4,   vr17
+    vmaddwod.w.h     vr10,   vr5,   vr18
+    vmaddwod.w.h     vr14,   vr6,   vr19
+    vmaddwod.w.h     vr15,   vr13,  vr20
+    vadd.w           vr21,   vr9,   vr10
+    vadd.w           vr21,   vr21,  vr14
+    vadd.w           vr21,   vr21,  vr15
+
+    vssrarni.hu.w    vr21,   vr16,  10
+    vssrani.bu.h     vr21,   vr21,  0
+    //cache
+    vaddi.hu         vr0,    vr1,   0
+    vaddi.hu         vr1,    vr2,   0
+    vaddi.hu         vr2,    vr3,   0
+    vaddi.hu         vr4,    vr5,   0
+    vaddi.hu         vr5,    vr6,   0
+    vaddi.hu         vr6,    vr13,  0
+    vaddi.hu         vr16,   vr12,  0
+
+    vstelm.w         vr21,   a0,    0,    0
     add.d            a0,     a0,    a1
-    vstelm.w         vr1,    a0,    0,    1
+    vstelm.w         vr21,   a0,    0,    1
     add.d            a0,     a0,    a1
-    vbsrl.v          vr0,    vr5,   2
-    vbsrl.v          vr1,    vr6,   2
-    vbsrl.v          vr2,    vr7,   2
-    vbsrl.v          vr3,    vr10,  2
     addi.w           a5,     a5,    -2
     bnez             a5,     .l_\lable\()put_hv_4w_loop
     b                .l_\lable\()end_put_8tap
@@ -3533,9 +3484,28 @@
 .l_\lable\()put_hv_32w:
 .l_\lable\()put_hv_64w:
 .l_\lable\()put_hv_128w:
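+    //save callee-saved f24-f31 (low halves of vr24-vr31, used as the
+    //sliding-window cache below)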
+    addi.d          sp,      sp,    -8*8
+    fst.d           f24,     sp,    0
+    fst.d           f25,     sp,    8
+    fst.d           f26,     sp,    16
+    fst.d           f27,     sp,    24
+    fst.d           f28,     sp,    32
+    fst.d           f29,     sp,    40
+    fst.d           f30,     sp,    48
+    fst.d           f31,     sp,    56
     addi.d          t0,      a2,    0 //src
     addi.d          t5,      a5,    0 //h
     addi.d          t8,      a0,    0 //dst
+    la.local        t1,      subpel_h_shuf1
+    vld             vr7,     t1,    0
+    vaddi.bu        vr11,    vr7,   4
+    vaddi.bu        vr12,    vr7,   8
+    vreplvei.w      vr10,    vr8,   1
+    vreplvei.w      vr8,     vr8,   0
+    vreplvei.w      vr20,    vr9,   1
+    vreplvei.w      vr21,    vr9,   2
+    vreplvei.w      vr22,    vr9,   3
+    vreplvei.w      vr9,     vr9,   0
 .l_\lable\()put_hv_8w_loop0:
     vld             vr0,     a2,    0
     vldx            vr1,     a2,    a3
@@ -3546,28 +3516,123 @@
     vldx            vr5,     a2,    t2
     vldx            vr6,     a2,    t3
     add.d           a2,      a2,    t4
-    FILTER_8TAP_8W  vr0
-    FILTER_8TAP_8W  vr1
-    FILTER_8TAP_8W  vr2
-    FILTER_8TAP_8W  vr3
-    FILTER_8TAP_8W  vr4
-    FILTER_8TAP_8W  vr5
-    FILTER_8TAP_8W  vr6
-    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
-                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
-                       vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
+
+    FILTER_8TAP_8W  vr0 //h0
+    FILTER_8TAP_8W  vr1 //h1
+    FILTER_8TAP_8W  vr2 //h2
+    FILTER_8TAP_8W  vr3 //h3
+    FILTER_8TAP_8W  vr4 //h4
+    FILTER_8TAP_8W  vr5 //h5
+    FILTER_8TAP_8W  vr6 //h6
+
+    //h0' low part
+    vilvl.h         vr23,    vr1,   vr0 //01
+    vilvl.h         vr24,    vr3,   vr2 //23
+    vilvl.h         vr25,    vr5,   vr4 //45
+    //h0' high part
+    vilvh.h         vr26,    vr1,   vr0 //01
+    vilvh.h         vr27,    vr3,   vr2 //23
+    vilvh.h         vr28,    vr5,   vr4 //45
+
+    //h1' low part
+    vilvl.h         vr29,    vr2,   vr1 //12
+    vilvl.h         vr30,    vr4,   vr3 //34
+    vilvl.h         vr31,    vr6,   vr5 //56
+    //h1' high part
+    vilvh.h         vr0,     vr2,   vr1 //12
+    vilvh.h         vr1,     vr4,   vr3 //34
+    vilvh.h         vr2,     vr6,   vr5 //56
+
 .l_\lable\()put_hv_8w_loop:
-    vld             vr20,    a2,    0
-    vldx            vr21,    a2,    a3
+    vld             vr3,     a2,    0
+    vldx            vr4,     a2,    a3
     add.d           a2,      a2,    t2
-    FILTER_8TAP_8W  vr20
-    FILTER_8TAP_8W  vr21
-    VEXTRINS_Hx8    vr20
-    FILTER_8TAP_8W_CLIP_STORE
-    VBSRL_Vx8
-    VEXTRINS_Hx8    vr21
-    FILTER_8TAP_8W_CLIP_STORE
-    VBSRL_Vx8
+
+    FILTER_8TAP_8W  vr3 //h7
+    FILTER_8TAP_8W  vr4 //h8
+
+    //h0' low part
+    vilvl.h         vr16,    vr3,   vr6 //67 ~low
+    vmulwev.w.h     vr13,    vr23,  vr9
+    vmulwev.w.h     vr14,    vr24,  vr20
+    vmulwev.w.h     vr15,    vr25,  vr21
+    vmulwev.w.h     vr17,    vr16,  vr22
+    vmaddwod.w.h    vr13,    vr23,  vr9
+    vmaddwod.w.h    vr14,    vr24,  vr20
+    vmaddwod.w.h    vr15,    vr25,  vr21
+    vmaddwod.w.h    vr17,    vr16,  vr22
+    vadd.w          vr13,    vr13,  vr14
+    vadd.w          vr13,    vr13,  vr15
+    vadd.w          vr13,    vr13,  vr17
+    //cache
+    vaddi.hu        vr23,    vr24,  0
+    vaddi.hu        vr24,    vr25,  0
+    vaddi.hu        vr25,    vr16,  0
+
+    //h0' high part
+    vilvh.h         vr17,    vr3,   vr6 //67 ~high
+    vmulwev.w.h     vr14,    vr26,  vr9
+    vmulwev.w.h     vr15,    vr27,  vr20
+    vmulwev.w.h     vr16,    vr28,  vr21
+    vmulwev.w.h     vr18,    vr17,  vr22
+    vmaddwod.w.h    vr14,    vr26,  vr9
+    vmaddwod.w.h    vr15,    vr27,  vr20
+    vmaddwod.w.h    vr16,    vr28,  vr21
+    vmaddwod.w.h    vr18,    vr17,  vr22
+    vadd.w          vr14,    vr14,  vr15
+    vadd.w          vr14,    vr14,  vr16
+    vadd.w          vr14,    vr14,  vr18
+    vssrarni.hu.w   vr14,    vr13,  10
+    vssrarni.bu.h   vr5,     vr14,  0
+    vstelm.d        vr5,     a0,    0,   0
+    add.d           a0,      a0,    a1
+    //cache
+    vaddi.hu        vr26,    vr27,  0
+    vaddi.hu        vr27,    vr28,  0
+    vaddi.hu        vr28,    vr17,  0
+    vaddi.hu        vr6,     vr4,   0
+
+    vilvl.h         vr5,     vr4,   vr3 //78 ~low
+    vilvh.h         vr4,     vr4,   vr3 //78 ~high
+
+    //h1' low part
+    vmulwev.w.h     vr13,    vr29,  vr9
+    vmulwev.w.h     vr14,    vr30,  vr20
+    vmulwev.w.h     vr15,    vr31,  vr21
+    vmulwev.w.h     vr16,    vr5,   vr22
+    vmaddwod.w.h    vr13,    vr29,  vr9
+    vmaddwod.w.h    vr14,    vr30,  vr20
+    vmaddwod.w.h    vr15,    vr31,  vr21
+    vmaddwod.w.h    vr16,    vr5,   vr22
+    vadd.w          vr13,    vr13,  vr14
+    vadd.w          vr13,    vr13,  vr15
+    vadd.w          vr13,    vr13,  vr16
+    //cache
+    vaddi.hu        vr29,    vr30,  0
+    vaddi.hu        vr30,    vr31,  0
+    vaddi.hu        vr31,    vr5,   0
+
+    //h1' high part
+    vmulwev.w.h     vr14,    vr0,   vr9
+    vmulwev.w.h     vr15,    vr1,   vr20
+    vmulwev.w.h     vr16,    vr2,   vr21
+    vmulwev.w.h     vr17,    vr4,   vr22
+    vmaddwod.w.h    vr14,    vr0,   vr9
+    vmaddwod.w.h    vr15,    vr1,   vr20
+    vmaddwod.w.h    vr16,    vr2,   vr21
+    vmaddwod.w.h    vr17,    vr4,   vr22
+    vadd.w          vr14,    vr14,  vr15
+    vadd.w          vr14,    vr14,  vr16
+    vadd.w          vr14,    vr14,  vr17
+    vssrarni.hu.w   vr14,    vr13,  10
+    vssrarni.bu.h   vr5,     vr14,  0
+    vstelm.d        vr5,     a0,    0,   0
+    add.d           a0,      a0,    a1
+    //cache
+    vaddi.hu        vr0,     vr1,   0
+    vaddi.hu        vr1,     vr2,   0
+    vaddi.hu        vr2,     vr4,   0
+
     addi.w          a5,      a5,    -2
     bnez            a5,      .l_\lable\()put_hv_8w_loop
     addi.d          a2,      t0,    8
@@ -3577,6 +3642,15 @@
     addi.d          a5,      t5,    0
     addi.w          a4,      a4,    -8
     bnez            a4,      .l_\lable\()put_hv_8w_loop0
+    fld.d           f24,     sp,    0
+    fld.d           f25,     sp,    8
+    fld.d           f26,     sp,    16
+    fld.d           f27,     sp,    24
+    fld.d           f28,     sp,    32
+    fld.d           f29,     sp,    40
+    fld.d           f30,     sp,    48
+    fld.d           f31,     sp,    56
+    addi.d          sp,      sp,    8*8
 .l_\lable\()end_put_8tap:
 .endm