Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV40 decoder
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  5. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION_RODATA
  27.  
  28. align 16
  29. pw_1024:   times 8 dw 1 << (16 - 6) ; 8 words of 1024; used as a pmulhrsw multiplier this yields (x + 16) >> 5
  30. ; Byte coefficient pairs for pmaddubsw (used by FILTER_SSSE3): three entries of 32 bytes each.
  31. sixtap_filter_hb_m:  times 8 db   1, -5    ; entry at +0: pair applied to the two outer tap pairs
  32.                      times 8 db  52, 20    ; entry at +0: pair applied to the two centre taps
  33.                      ; multiplied by 2 to have the same shift (taps still sum to 64)
  34.                      times 8 db   2, -10   ; entry at +32
  35.                      times 8 db  40,  40
  36.                      ; back to normal
  37.                      times 8 db   1, -5    ; entry at +64: entry 0 with the centre taps swapped
  38.                      times 8 db  20, 52
  39. ; Word coefficients for pmullw (used by FILTER_V and FILTER_H): three entries of 64 bytes, four 16-byte rows each.
  40. sixtap_filter_v_m:   times 8 dw   1        ; row +0  (read as COEFF05)
  41.                      times 8 dw  -5        ; row +16 (read as COEFF14)
  42.                      times 8 dw  52        ; row +32 (read as COEFF2)
  43.                      times 8 dw  20        ; row +48 (read as COEFF3)
  44.                      ; multiplied by 2 to have the same shift
  45.                      times 8 dw   2
  46.                      times 8 dw -10
  47.                      times 8 dw  40
  48.                      times 8 dw  40
  49.                      ; back to normal
  50.                      times 8 dw   1
  51.                      times 8 dw  -5
  52.                      times 8 dw  20
  53.                      times 8 dw  52
  54.  
  55. %ifdef PIC ; PIC build: tables are reached through an extra register (picreg) loaded with lea in each function
  56. %define sixtap_filter_hw   picregq
  57. %define sixtap_filter_hb   picregq
  58. %define sixtap_filter_v    picregq
  59. %define npicregs 1
  60. %else ; non-PIC build: the table symbols are added to the offset directly, no extra register is requested
  61. %define sixtap_filter_hw   sixtap_filter_hw_m
  62. %define sixtap_filter_hb   sixtap_filter_hb_m
  63. %define sixtap_filter_v    sixtap_filter_v_m
  64. %define npicregs 0
  65. %endif ; NOTE(review): sixtap_filter_hw is never used and sixtap_filter_hw_m is never defined in the visible source - presumably leftovers, confirm before removing
  66.  
  67. filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8 ; pshufb mask: byte pairs (i, i+1); with data loaded from src-2 these are pixels x-2, x-1
  68. filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10 ; pshufb mask: byte pairs (i+2, i+3), i.e. pixels x, x+1
  69. filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 ; pshufb mask: byte pairs (i+5, i+4), i.e. pixels x+3, x+2 (reversed so the same coefficient pair as shuf1 applies)
  70. ; External constants: pw_32 is added before the shift by 6, pw_16 before the shift by 5, pw_512 is a pmulhrsw multiplier (values presumably match the names - defined elsewhere)
  71. cextern  pw_32
  72. cextern  pw_16
  73. cextern  pw_512
  74.  
  75. SECTION .text
  76.  
  77. ;-----------------------------------------------------------------------------
  78. ; subpel MC functions:
  79. ;
  80. ; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
  81. ;                                         uint8_t *src, int srcstride,
  82. ;                                         int height, int m);
  83. ;----------------------------------------------------------------------
  84. %macro LOAD  2 ; args: offset register name (no size suffix), coefficient table; result: 64-bit register = table address + offset
  85. %if WIN64
  86.    movsxd   %1q, %1d                       ; Win64 only: sign-extend the 32-bit offset to the full register first
  87. %endif
  88. %ifndef PIC
  89.    add      %1q, %2                        ; non-PIC: add the table address directly
  90. %else
  91.    add      %1q, picregq                   ; PIC: the caller has already loaded the table address into picreg
  92. %endif
  93. %endmacro
  94.  
  95. %macro STORE 3 ; args: result register (words), scratch register, put or avg; writes half a register of bytes to [dstq]
  96. %ifidn %3, avg
  97.     movh      %2, [dstq]                   ; avg: fetch the bytes currently in dst
  98.     packuswb  %1, %1                       ; words -> bytes with unsigned saturation
  99.     PAVGB     %1, %2
  100. %else
  101.     packuswb  %1, %1                       ; words -> bytes with unsigned saturation
  102. %endif
  103.     movh  [dstq], %1
  104. %endmacro
  105.  
  106. %macro FILTER_V 1 ; vertical 6-tap qpel filter using word multiplies; the argument is put or avg (store mode, forwarded to STORE)
  107. cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  108. %ifdef PIC
  109.     lea  picregq, [sixtap_filter_v_m]      ; PIC: table base goes into the extra register
  110. %endif
  111.     pxor      m7, m7                       ; m7 = 0, used to zero-extend bytes to words
  112.     LOAD      my, sixtap_filter_v
  113. ; from here on myq holds the address of the selected coefficient entry (table base + my)
  114.     ; prime the row window: m0..m4 = rows -2..+2 relative to the incoming src
  115.     sub     srcq, srcstrideq
  116.     sub     srcq, srcstrideq               ; srcq -> row -2
  117.     movh      m0, [srcq]                   ; row -2
  118.     movh      m1, [srcq+srcstrideq]        ; row -1
  119.     movh      m2, [srcq+srcstrideq*2]      ; row  0
  120.     lea     srcq, [srcq+srcstrideq*2]
  121.     add     srcq, srcstrideq               ; srcq -> row +1 (three rows further down)
  122.     movh      m3, [srcq]                   ; row +1
  123.     movh      m4, [srcq+srcstrideq]        ; row +2
  124.     punpcklbw m0, m7                       ; widen all five rows to 16-bit words
  125.     punpcklbw m1, m7
  126.     punpcklbw m2, m7
  127.     punpcklbw m3, m7
  128.     punpcklbw m4, m7
  129.  
  130. %ifdef m8 ; registers m8 and up exist: keep the four coefficient rows in registers, otherwise use memory operands
  131.     mova      m8, [myq+ 0]
  132.     mova      m9, [myq+16]
  133.     mova     m10, [myq+32]
  134.     mova     m11, [myq+48]
  135. %define COEFF05  m8
  136. %define COEFF14  m9
  137. %define COEFF2   m10
  138. %define COEFF3   m11
  139. %else
  140. %define COEFF05  [myq+ 0]
  141. %define COEFF14  [myq+16]
  142. %define COEFF2   [myq+32]
  143. %define COEFF3   [myq+48]
  144. %endif
  145. .nextrow:
  146.     mova      m6, m1
  147.     movh      m5, [srcq+2*srcstrideq]      ; read new row (+3; srcq sits at row +1)
  148.     paddw     m6, m4                       ; rows -1 and +2 share one coefficient
  149.     punpcklbw m5, m7
  150.     pmullw    m6, COEFF14
  151.     paddw     m0, m5                       ; rows -2 and +3 share one coefficient
  152.     pmullw    m0, COEFF05
  153.     paddw     m6, m0
  154.     mova      m0, m1                       ; start sliding the window: old row -1 becomes row -2
  155.     paddw     m6, [pw_32]                  ; rounding term added before the shift by 6
  156.     mova      m1, m2
  157.     pmullw    m2, COEFF2                   ; row 0 term
  158.     paddw     m6, m2
  159.     mova      m2, m3
  160.     pmullw    m3, COEFF3                   ; row +1 term
  161.     paddw     m6, m3
  162.  
  163.     ; round/clip/store
  164.     mova      m3, m4
  165.     psraw     m6, 6                        ; each coefficient entry sums to 64
  166.     mova      m4, m5                       ; window is complete for the next line; m5 is free to serve as STORE scratch
  167.     STORE     m6, m5, %1
  168.  
  169.     ; go to next line
  170.     add     dstq, dststrideq
  171.     add     srcq, srcstrideq
  172.     dec  heightd                           ; next row
  173.     jg .nextrow                            ; loop while rows remain
  174.     REP_RET
  175. %endmacro
  176.  
  177. %macro FILTER_H  1 ; horizontal 6-tap qpel filter using word multiplies; the argument is put or avg (store mode, forwarded to STORE)
  178. cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  179. %ifdef PIC
  180.     lea  picregq, [sixtap_filter_v_m]      ; the word-coefficient table is shared with the vertical filter
  181. %endif
  182.     pxor      m7, m7                       ; m7 = 0, used to zero-extend bytes to words
  183.     LOAD      mx, sixtap_filter_v
  184.     mova      m6, [pw_32]                  ; m6 = rounding term, kept in a register for every row
  185. %ifdef m8 ; registers m8 and up exist: keep the four coefficient rows in registers, otherwise use memory operands
  186.     mova      m8, [mxq+ 0]
  187.     mova      m9, [mxq+16]
  188.     mova     m10, [mxq+32]
  189.     mova     m11, [mxq+48]
  190. %define COEFF05  m8
  191. %define COEFF14  m9
  192. %define COEFF2   m10
  193. %define COEFF3   m11
  194. %else
  195. %define COEFF05  [mxq+ 0]
  196. %define COEFF14  [mxq+16]
  197. %define COEFF2   [mxq+32]
  198. %define COEFF3   [mxq+48]
  199. %endif
  200. .nextrow:
  201.     movq      m0, [srcq-2]                 ; six overlapping 8-byte loads, one per tap position (-2 .. +3)
  202.     movq      m5, [srcq+3]
  203.     movq      m1, [srcq-1]
  204.     movq      m4, [srcq+2]
  205.     punpcklbw m0, m7                       ; widen to 16-bit words
  206.     punpcklbw m5, m7
  207.     punpcklbw m1, m7
  208.     punpcklbw m4, m7
  209.     movq      m2, [srcq-0]
  210.     movq      m3, [srcq+1]
  211.     paddw     m0, m5                       ; taps -2 and +3 share one coefficient
  212.     paddw     m1, m4                       ; taps -1 and +2 share one coefficient
  213.     punpcklbw m2, m7
  214.     punpcklbw m3, m7
  215.     pmullw    m0, COEFF05
  216.     pmullw    m1, COEFF14
  217.     pmullw    m2, COEFF2
  218.     pmullw    m3, COEFF3
  219.     paddw     m0, m6                       ; add the rounding term
  220.     paddw     m1, m2
  221.     paddw     m0, m3
  222.     paddw     m0, m1                       ; m0 = full 6-tap sum + rounding
  223.     psraw     m0, 6                        ; each coefficient entry sums to 64
  224.     STORE     m0, m1, %1
  225.  
  226.     ; go to next line
  227.     add     dstq, dststrideq
  228.     add     srcq, srcstrideq
  229.     dec  heightd            ; next row
  230.     jg .nextrow             ; loop while rows remain
  231.     REP_RET
  232. %endmacro
  233.  
  234. %if ARCH_X86_32 ; the MMX-register variants below are only assembled for 32-bit x86
  235. INIT_MMX  mmx
  236. FILTER_V  put
  237. FILTER_H  put
  238. ; avg variants go through PAVGB in STORE, so they are built per instruction set (mmxext, 3dnow) rather than for plain mmx
  239. INIT_MMX  mmxext
  240. FILTER_V  avg
  241. FILTER_H  avg
  242.  
  243. INIT_MMX  3dnow
  244. FILTER_V  avg
  245. FILTER_H  avg
  246. %endif
  247. ; SSE2 variants (XMM registers): put and avg for both directions, built on every architecture
  248. INIT_XMM  sse2
  249. FILTER_H  put
  250. FILTER_H  avg
  251. FILTER_V  put
  252. FILTER_V  avg
  253.  
  254. %macro FILTER_SSSE3 1 ; emits the vertical and the horizontal qpel function built on pmaddubsw; the argument is put or avg
  255. cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  256. %ifdef PIC
  257.     lea  picregq, [sixtap_filter_hb_m]     ; PIC: byte-pair table base goes into the extra register
  258. %endif
  259.  
  260.     ; prime the row window: m0..m4 = rows -2..+2 (kept as bytes)
  261.     sub     srcq, srcstrideq
  262.     LOAD      my, sixtap_filter_hb
  263.     sub     srcq, srcstrideq               ; srcq -> row -2
  264.     movh      m0, [srcq]                   ; row -2
  265.     movh      m1, [srcq+srcstrideq]        ; row -1
  266.     movh      m2, [srcq+srcstrideq*2]      ; row  0
  267.     lea     srcq, [srcq+srcstrideq*2]
  268.     add     srcq, srcstrideq               ; srcq -> row +1
  269.     mova      m5, [myq]                    ; m5 = first 16 bytes of the entry: pair for the outer taps
  270.     movh      m3, [srcq]                   ; row +1
  271.     movh      m4, [srcq+srcstrideq]        ; row +2
  272.     lea     srcq, [srcq+2*srcstrideq]      ; srcq -> row +3, the next row to be read
  273.  
  274. .nextrow:
  275.     mova      m6, m2
  276.     punpcklbw m0, m1                       ; interleave rows -2 / -1
  277.     punpcklbw m6, m3                       ; interleave rows  0 / +1
  278.     pmaddubsw m0, m5                       ; outer pair applied to rows -2 / -1
  279.     pmaddubsw m6, [myq+16]                 ; centre pair applied to rows 0 / +1
  280.     movh      m7, [srcq]      ; read new row (+3)
  281.     paddw     m6, m0
  282.     mova      m0, m1                       ; slide the row window down by one
  283.     mova      m1, m2
  284.     mova      m2, m3
  285.     mova      m3, m4
  286.     mova      m4, m7
  287.     punpcklbw m7, m3                       ; interleave rows +3 / +2 (m3 now holds the former row +2)
  288.     pmaddubsw m7, m5                       ; same outer pair, applied in mirrored order
  289.     paddw     m6, m7
  290.     pmulhrsw  m6, [pw_512]                 ; rounding multiply instead of add + shift; presumably (x + 32) >> 6 given the constant name
  291.     STORE     m6, m7, %1
  292.  
  293.     ; go to next line
  294.     add     dstq, dststrideq
  295.     add     srcq, srcstrideq
  296.     dec       heightd                          ; next row
  297.     jg       .nextrow                          ; loop while rows remain
  298.     REP_RET
  299. ; horizontal variant: one unaligned load per row, three pshufb masks build the tap pairs
  300. cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  301. %ifdef PIC
  302.     lea  picregq, [sixtap_filter_hb_m]
  303. %endif
  304.     mova      m3, [filter_h6_shuf2]        ; mask producing the centre pairs
  305.     mova      m4, [filter_h6_shuf3]        ; mask producing the right-hand outer pairs (reversed)
  306.     LOAD      mx, sixtap_filter_hb
  307.     mova      m5, [mxq] ; set up 6tap filter in bytes (pair for the outer taps)
  308.     mova      m6, [mxq+16]                 ; pair for the two centre taps
  309.     mova      m7, [filter_h6_shuf1]        ; mask producing the left-hand outer pairs
  310.  
  311. .nextrow:
  312.     movu      m0, [srcq-2]                 ; unaligned load starting two bytes left of the first output pixel
  313.     mova      m1, m0
  314.     mova      m2, m0
  315.     pshufb    m0, m7
  316.     pshufb    m1, m3
  317.     pshufb    m2, m4
  318.     pmaddubsw m0, m5
  319.     pmaddubsw m1, m6
  320.     pmaddubsw m2, m5
  321.     paddw     m0, m1
  322.     paddw     m0, m2                       ; m0 = full 6-tap sum as words
  323.     pmulhrsw  m0, [pw_512]                 ; same rounding multiply as in the vertical variant
  324.     STORE     m0, m1, %1
  325.  
  326.     ; go to next line
  327.     add     dstq, dststrideq
  328.     add     srcq, srcstrideq
  329.     dec  heightd            ; next row
  330.     jg .nextrow             ; loop while rows remain
  331.     REP_RET
  332. %endmacro
  333. ; SSSE3 variants: each FILTER_SSSE3 expansion emits both the _v and the _h function
  334. INIT_XMM ssse3
  335. FILTER_SSSE3  put
  336. FILTER_SSSE3  avg
  337.  
  338. ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride (only passed for 8-wide blocks with XMM registers)
  339. %macro RV40_WCORE  4-5 ; weights one register of pixels; expects r6 = current offset, m0 = 0, m1 = rounding constant, m2/m3 = weights
  340.     movh       m4, [%3 + r6 + 0]           ; first half: src1
  341.     movh       m5, [%4 + r6 + 0]           ; first half: src2
  342. %if %0 == 4
  343. %define OFFSET r6 + mmsize / 2
  344. %else
  345.     ; stride was provided (8-wide block in an XMM register): the second half comes from the next row
  346. %define OFFSET r6
  347.     add        r6, r5
  348. %endif
  349.     movh       m6, [%3 + OFFSET]           ; second half: src1
  350.     movh       m7, [%4 + OFFSET]           ; second half: src2
  351.  
  352. %if %1 == 0
  353.     ; 14bits weights
  354.     punpcklbw  m4, m0                      ; zero-extend bytes to words
  355.     punpcklbw  m5, m0
  356.     punpcklbw  m6, m0
  357.     punpcklbw  m7, m0
  358.  
  359.     psllw      m4, 7                       ; pixel << 7, so the high product word below equals (pixel * weight) >> 9
  360.     psllw      m5, 7
  361.     psllw      m6, 7
  362.     psllw      m7, 7
  363.     pmulhw     m4, m3                      ; src1 pixels are scaled by m3
  364.     pmulhw     m5, m2                      ; src2 pixels are scaled by m2
  365.     pmulhw     m6, m3
  366.     pmulhw     m7, m2
  367.  
  368.     paddw      m4, m5                      ; sum of both weighted sources
  369.     paddw      m6, m7
  370. %else
  371.     ; 5bits weights
  372. %if cpuflag(ssse3)
  373.     punpcklbw  m4, m5                      ; interleave src1/src2 bytes for pmaddubsw
  374.     punpcklbw  m6, m7
  375.  
  376.     pmaddubsw  m4, m3                      ; m3 holds the interleaved byte weights: multiply and add pairs in one step
  377.     pmaddubsw  m6, m3
  378. %else
  379.     punpcklbw  m4, m0                      ; zero-extend bytes to words
  380.     punpcklbw  m5, m0
  381.     punpcklbw  m6, m0
  382.     punpcklbw  m7, m0
  383.  
  384.     pmullw     m4, m3                      ; src1 pixels are scaled by m3
  385.     pmullw     m5, m2                      ; src2 pixels are scaled by m2
  386.     pmullw     m6, m3
  387.     pmullw     m7, m2
  388.     paddw      m4, m5
  389.     paddw      m6, m7
  390. %endif
  391.  
  392. %endif
  393.  
  394.     ; bias and shift down
  395. %if cpuflag(ssse3)
  396.     pmulhrsw   m4, m1                      ; m1 is loaded from pw_1024 by the caller: rounding shift by 5 in one instruction
  397.     pmulhrsw   m6, m1
  398. %else
  399.     paddw      m4, m1                      ; m1 is loaded from pw_16 by the caller: rounding term
  400.     paddw      m6, m1
  401.     psrlw      m4, 5
  402.     psrlw      m6, 5
  403. %endif
  404.  
  405.     packuswb   m4, m6                      ; both halves back to bytes with unsigned saturation
  406. %if %0 == 5
  407.     ; stride was provided: low half goes to the previous row, high half to the current one
  408.     sub        r6, r5
  409.     movh       [%2 + r6], m4
  410.     add        r6, r5                      ; r6 is left one stride further than on entry
  411.     movhps     [%2 + r6], m4
  412. %else
  413.     mova       [%2 + r6], m4               ; full-register store at the current offset
  414. %endif
  415. %endmacro
  416.  
  417.  
  418. %macro MAIN_LOOP   2
  419. ; arg 1 = block width (8 or 16), arg 2 = 1 for 5-bit weights, 0 for 14-bit weights
  420. ; Emits the weighting code for one loop iteration, then steps r6 by the
  421. ; stride r5; that final add sets the flags tested by the jnz in RV40_WEIGHT.
  422. %if mmsize == 8
  423.     ; MMX registers: 8 pixels per call, so 16-wide rows need a second call at +8
  424.     RV40_WCORE %2, r0, r1, r2
  425. %if %1 == 16
  426.     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  427. %endif
  428. %elif %1 == 8
  429.     ; XMM registers, 8-wide: passing the stride makes RV40_WCORE handle two
  430.     ; rows and advance r6 by one stride itself
  431.     RV40_WCORE %2, r0, r1, r2, r5
  432. %else
  433.     ; XMM registers, 16-wide: one full row per call
  434.     RV40_WCORE %2, r0, r1, r2
  435. %endif
  436.  
  437.     ; step to the next unprocessed row
  438.     add        r6, r5
  439. %endmacro
  440.  
  441. ; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  442. ; %1=rnd or nornd  %2=block size (8 or 16)  %3=log2 of the block size
  443. ; The weights are FP0.14 notation of fractions depending on pts.
  444. ; For timebases without rounding error (i.e. PAL), the fractions
  445. ; can be simplified, and several operations can be avoided.
  446. ; Therefore, we check here whether they are multiples of 2^9 for
  447. ; those simplifications to occur.
  448. %macro RV40_WEIGHT  3 ; args: rnd or nornd, block size, log2 of the block size; emits one weighted-average function
  449. cglobal rv40_weight_func_%1_%2, 6, 7, 8
  450. %if cpuflag(ssse3)
  451.     mova       m1, [pw_1024]               ; multiplier for the pmulhrsw in RV40_WCORE
  452. %else
  453.     mova       m1, [pw_16]                 ; rounding term added before the shift by 5 in RV40_WCORE
  454. %endif
  455.     pxor       m0, m0                      ; m0 = 0, used to zero-extend bytes to words
  456.     ; Set loop counter and increments
  457.     mov        r6, r5
  458.     shl        r6, %3                      ; r6 = stride * block size, the byte span of the whole block
  459.     add        r0, r6                      ; move dst/src1/src2 past the block ...
  460.     add        r1, r6
  461.     add        r2, r6
  462.     neg        r6                          ; ... and index them with a negative offset that counts up to zero
  463.  
  464.     movd       m2, r3d                     ; m2 = w1 (4th argument)
  465.     movd       m3, r4d                     ; m3 = w2 (5th argument)
  466. %ifidn %1,rnd
  467. %define  RND   0
  468.     SPLATW     m2, m2
  469. %else
  470. %define  RND   1
  471. %if cpuflag(ssse3)
  472.     punpcklbw  m3, m2                      ; low bytes of w2 and w1 interleaved into one word for pmaddubsw
  473. %else
  474.     SPLATW     m2, m2
  475. %endif
  476. %endif
  477.     SPLATW     m3, m3
  478.  
  479. .loop:
  480.     MAIN_LOOP  %2, RND
  481.     jnz        .loop                       ; flags come from the final add r6, r5 in MAIN_LOOP; done when r6 reaches zero
  482.     REP_RET
  483. %endmacro
  484. ; Instantiate rnd/nornd weight functions for 8- and 16-wide blocks; the last argument is log2 of the size
  485. INIT_MMX mmxext
  486. RV40_WEIGHT   rnd,    8, 3
  487. RV40_WEIGHT   rnd,   16, 4
  488. RV40_WEIGHT   nornd,  8, 3
  489. RV40_WEIGHT   nornd, 16, 4
  490. ; XMM variants: 8-wide blocks take the two-rows-per-iteration path in MAIN_LOOP
  491. INIT_XMM sse2
  492. RV40_WEIGHT   rnd,    8, 3
  493. RV40_WEIGHT   rnd,   16, 4
  494. RV40_WEIGHT   nornd,  8, 3
  495. RV40_WEIGHT   nornd, 16, 4
  496. ; SSSE3 variants: pmulhrsw rounding, and pmaddubsw for the 5-bit (nornd) weights
  497. INIT_XMM ssse3
  498. RV40_WEIGHT   rnd,    8, 3
  499. RV40_WEIGHT   rnd,   16, 4
  500. RV40_WEIGHT   nornd,  8, 3
  501. RV40_WEIGHT   nornd, 16, 4
  502.