;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_1
cextern pb_0

pw_pixel_max: times 8 dw ((1 << 10)-1)

pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16

tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1
pd_0f: times 4 dd 0xffff
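
; Constant notes (inferred from their uses below):
; - pad20/depad/depad2/unpad bias the unscaled 6-tap sums (a-5*b+20*c) stored
;   by the two-pass (hv) filters so they fit comfortably in 16-bit words and
;   can be unbiased again with a single add before the final shift.
; - tap1/tap2/tap3 hold the interleaved coefficient pairs (1,-5), (20,20),
;   (-5,1) for the pmaddwd-based horizontal pass in H_LOOP; pd_0f masks the
;   low 16 bits of each dword when the two dword halves are repacked to words.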

SECTION .text


%macro AVG_MOV 2
    pavgw %2, %1
    mova  %1, %2
%endmacro

%macro ADDW 3
%if mmsize == 8
    paddw %1, %2
%else
    movu  %3, %2
    paddw %1, %3
%endif
%endmacro
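
; ADDW adds a (possibly unaligned) memory operand to %1. MMX paddw can take
; the 8-byte memory operand directly; the SSE2 form requires 16-byte
; alignment, so the operand is first loaded with movu into the scratch
; register %3.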

%macro FILT_H 4
    paddw  %1, %4
    psubw  %1, %2  ; a-b
    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
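
; Worked check of the shift decomposition (ignoring truncation from the
; arithmetic shifts):
;   ((a-b)/4 - b + c)/4 + c = (a - 5*b + 4*c)/16 + c = (a - 5*b + 20*c)/16
; %4 is a rounding bias (pw_16 for the half-pel filters) folded into a up front.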

%macro PRELOAD_V 0
    lea      r3, [r2*3]
    sub      r1, r3
    movu     m0, [r1+r2]
    movu     m1, [r1+r2*2]
    add      r1, r3
    movu     m2, [r1]
    movu     m3, [r1+r2]
    movu     m4, [r1+r2*2]
    add      r1, r3
%endmacro

%macro FILT_V 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H   %1, %7, %8, [pw_16]
    psraw    %1, 1
    CLIPW    %1, [pb_0], [pw_pixel_max]
%endmacro
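
; One row of the vertical half-pel filter. On entry %1..%5 hold the five rows
; above the current one (primed by PRELOAD_V); the sixth row is read from
; [r1]. The pairs are summed into a = row0+row5, b = row1+row4, c = row2+row3,
; and FILT_H with the pw_16 bias plus the extra psraw produce
; (a - 5*b + 20*c + 16) >> 5, clipped to [0, pixel_max].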

%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
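
; MC instantiates a qpel macro in all four flavours: put (plain store) and
; avg (pavgw with the existing destination), each as a 4-wide mmxext build
; and an 8-wide sse2 build.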

%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %3*2
    add  r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3]
    lea  r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3+%3*2]
    lea  r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov r%6, r0
%assign p1 %6+1
    mov r %+ p1, r1
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+%3*2]
    lea  r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3]
    lea  r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3+%3*2]
    lea  r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro
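
; MCAxA_OP builds the %4x%4 function out of four calls to the %3x%3 stub, one
; per quadrant (dst/src offset by %3 pixels horizontally, %3 rows vertically,
; or both). x86-32 reloads the pointers from the stack arguments between
; calls; x86-64 keeps them in two extra registers. On UNIX64 the fourth
; call/RET pair is omitted and execution falls through into the %3-sized
; function that cglobal_mc emits right after this wrapper.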

;put/avg, mc, 4/8, then cglobal arg/gpr/xmm counts
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro
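
; cglobal_mc emits, in order: the double-size (%3*2) wrapper via MCAxA_OP,
; the %3-sized public entry point, and the stub_* label whose body is the
; code following the macro invocation. On non-UNIX64 targets the public entry
; calls the stub and returns; on UNIX64 it simply falls through into it,
; since no prologue or epilogue is needed there.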

;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
    movu          m0, [r1     ]
    OP_MOV [r0     ], m0
    movu          m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu          m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu          m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro

%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea           r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov r3d, 8
.loop:
    movu           m0, [r1      ]
    movu           m1, [r1   +16]
    OP_MOV [r0      ], m0
    OP_MOV [r0   +16], m1
    movu           m0, [r1+r2   ]
    movu           m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea            r0, [r0+r2*2]
    lea            r1, [r1+r2*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro
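
; Like MC, but additionally emits sse2+cache64 and ssse3+cache64 builds of
; the 8-wide functions alongside the plain sse2 one, so put and avg each get
; four variants of the horizontal filters below.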

%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP      2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    OP_MOV [r0], m2
    add      r0, r2
    add      r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro
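
; mc30 reuses the mc10 body below with r4 pointing one pixel (2 bytes) to the
; right of the source, so the horizontal half-pel result is averaged with the
; right-hand full-pel sample instead of the left-hand one.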

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov      r4, r1
.body:
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP      2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]
    pavgw    m2, m3
    OP_MOV [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10:
    add    r4, r2
.no_addr4:
    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
    add    r1, r2
    add    r0, r2
    ret
%endmacro
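
; Each v_filt<size>_<i>_10 helper filters one row vertically via FILT_V and
; advances the source and destination pointers. The main entry also advances
; r4 (the pointer the averaging variants blend with); .no_addr4 skips that.
; The %rep/SWAP blocks below generate one copy per register rotation, so a
; caller that SWAPs its sliding row registers after every row can call the
; matching helper (selected by j % 6 in the users further down).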

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro
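
; mc02 (vertical half-pel): PRELOAD_V primes the five rows above the block,
; then each output row is produced by the matching v_filt helper and stored;
; SWAP keeps the six-row sliding window rotating through the registers.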

MC MC02

;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r4, r2
    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu     m7, [r4]
    pavgw    m0, m7
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro
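
; mc01 averages the vertical half-pel result with the unfiltered source row
; addressed through r4; mc03 below reuses this body with r4 = src + stride,
; i.e. it blends with the row below instead of the row above.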

MC MC01

;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu     m5, [r4-4]
    ADDW     m5, [r4+6], m7
    movu     m6, [r4-2]
    ADDW     m6, [r4+4], m7
    paddw    m5, [pw_16]
    psubw    m5, m6  ; a-b
    psraw    m5, 2   ; (a-b)/4
    psubw    m5, m6  ; (a-b)/4-b
    movu     m6, [r4+0]
    ADDW     m6, [r4+2], m7
    paddw    m5, m6  ; (a-b)/4-b+c
    psraw    m5, 2   ; ((a-b)/4-b+c)/4
    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw    m5, 1
    CLIPW    m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw    m0, m5
%if %0!=4
    movu     m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r0, r2
    sub      r4, r2
    mov      r5, r2
    neg      r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu     m5, [r1+r5]
%endif
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro
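
; mc11 averages the vertical half-pel (v_filt, result in m0) with the
; horizontal half-pel computed on the row addressed by r4 (h_filt), one row
; at a time. mc31, mc13 and mc33 below jump into this body with r1 and/or r4
; offset by one pixel or one row to hit the other diagonal positions.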

MC MC11

;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov r4, r1
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea r4, [r1+r2]
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
    psubw  %1, %2  ; a-b
    psubw  %2, %3  ; b-c
    psllw  %2, 2
    psubw  %1, %2  ; a-5*b+4*c
    psllw  %3, 4
    paddw  %1, %3  ; a-5*b+20*c
%endmacro
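
; FILT_H2 is the unscaled 6-tap: a - b - 4*(b-c) + 16*c = a - 5*b + 20*c.
; Unlike FILT_H it keeps the full 16-bit intermediate (no rounding, shift or
; clip), which is what the two-pass hv filters need for their first pass.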

%macro FILT_VNRD 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H2  %1, %7, %8
%endmacro
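
; Like FILT_V, but using FILT_H2: the pair sums a, b, c are formed the same
; way and the row is left as the raw a-5*b+20*c value for the horizontal
; pass that follows.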

%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg      r2           ; This actually saves instructions
    lea      r1, [r1+r2*2-mmsize+PAD]
    lea      r4, [rsp+PAD+gprsize]
    mov     r3d, COUNT
.v_loop:
    movu     m0, [r1]
    sub      r1, r2
    movu     m1, [r1]
    sub      r1, r2
    movu     m2, [r1]
    sub      r1, r2
    movu     m3, [r1]
    sub      r1, r2
    movu     m4, [r1]
    sub      r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    sub      r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    add      r4, mmsize
    lea      r1, [r1+r2*8+mmsize]
%if %1==8
    lea      r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg      r2
    ret
%endmacro
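
; put_hv<size>_10: vertical first pass of the centre (hv) positions. It runs
; the unrounded vertical filter over COUNT column strips of mmsize bytes
; (wide enough to cover the extra columns the later horizontal pass reads),
; subtracts the pad20 bias from each row (the second pass adds it back via
; depad/depad2), and stores the rows into the caller's stack buffer at a
; stride of mmsize*3 bytes.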

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8

%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu       m1, [r1+mmsize-4]
    movu       m2, [r1+mmsize-2]
    mova       m3, [r1+mmsize+0]
    movu       m4, [r1+mmsize+2]
    movu       m5, [r1+mmsize+4]
    movu       m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd    m1, s1
    pmaddwd    m2, s1
    pmaddwd    m3, s2
    pmaddwd    m4, s2
    pmaddwd    m5, s3
    pmaddwd    m6, s3
    paddd      m1, d1
    paddd      m2, d1
%else
    mova       m0, s1
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    mova       m0, s2
    pmaddwd    m3, m0
    pmaddwd    m4, m0
    mova       m0, s3
    pmaddwd    m5, m0
    pmaddwd    m6, m0
    mova       m0, d1
    paddd      m1, m0
    paddd      m2, m0
%endif
    paddd      m3, m5
    paddd      m4, m6
    paddd      m1, m3
    paddd      m2, m4
    psrad      m1, 10
    psrad      m2, 10
    pslld      m2, 16
    pand       m1, [pd_0f]
    por        m1, m2
%if num_mmregs <= 8
    pxor       m0, m0
%endif
    CLIPW      m1, m0, m7
    add        r1, mmsize*3
    ret
%endmacro
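
; h<size>_loop_op: horizontal second pass over one row of vertical
; intermediates. pmaddwd with the interleaved tap pairs accumulates the 6-tap
; sums in dwords (even and odd output pixels separately), depad undoes the
; 32*pad20 bias and adds the +512 rounding term, psrad 10 applies the final
; scaling, and the two dword halves are repacked into words (pd_0f mask plus
; shift/or) before clipping against pixel_max in m7.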

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8

%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    mov       r3d, %2
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor       m0, m0
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov        r1, rsp
.h_loop:
    call h%2_loop_op

    OP_MOV   [r0], m1
    add        r0, r2
    dec       r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC22

;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov        r6, rsp          ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    xor       r4d, r4d
.body:
    mov       r3d, %2
    pxor       m0, m0
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov        r1, rsp
.h_loop:
    call h%2_loop_op

    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw      m3, [depad2]
    psrlw      m3, 5
    psubw      m3, [unpad]
    CLIPW      m3, m0, m7
    pavgw      m1, m3

    OP_MOV   [r0], m1
    add        r0, r2
    dec       r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro
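
; mc12 averages the hv result with the vertical-only half-pel, recovered
; straight from the biased intermediates in the stack buffer:
; [r1+r4-2*mmsize] is the output-aligned part of the row just consumed by
; h_loop_op (r4 = 0 here, 2 for mc32 below, i.e. shifted one pixel), and
; adding depad2, shifting right by 5 and subtracting unpad undoes the pad20
; bias while applying the +16 rounding and >>5 scaling in unsigned arithmetic.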

MC MC12

;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2  ; SIZE*16*3*sizeof(pixel)
    mov  r6, rsp          ; backup stack pointer
    and rsp, ~(mmsize-1)  ; align stack
    sub rsp, PAD

    call put_hv%2_10

    mov r4d, 2            ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 1
put_h%1_10:
    add       rsp, gprsize
    mov       r3d, %1
    xor       r4d, r4d
    mova       m6, [pad20]
.nextrow:
    movu       m2, [r5-4]
    movu       m3, [r5-2]
    movu       m4, [r5+0]
    ADDW       m2, [r5+6], m5
    ADDW       m3, [r5+4], m5
    ADDW       m4, [r5+2], m5

    FILT_H2    m2, m3, m4
    psubw      m2, m6
    mova [rsp+r4], m2
    add       r4d, mmsize*3
    add        r5, r2
    dec       r3d
    jg .nextrow
    sub       rsp, gprsize
    ret
%endmacro
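
; put_h<size>_10: horizontal-only first pass for mc21/mc23. It stores the
; biased (psubw pad20) unrounded rows at the same mmsize*3 stride as put_hv,
; so the mc12 body can average against them unchanged. The add/sub of gprsize
; temporarily steps rsp over the return address pushed by call, so [rsp+r4]
; addresses the caller's stack buffer directly.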

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8

%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov   r5, r1
.body:
%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
    mov   r6, rsp          ; backup stack pointer
    and  rsp, ~(mmsize-1)  ; align stack

    sub  rsp, PAD
    call put_h%2_10

    sub  rsp, PAD
    call put_hv%2_10

    mov r4d, PAD-mmsize    ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea   r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23