Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV40 decoder
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION_RODATA
  27.  
  28. align 16
      ; 8 words of 1024 (1 << 10).  The SSSE3 weight functions load it into m1
      ; and use it with pmulhrsw, where it acts as a rounded shift:
      ; (x * 1024 + 0x4000) >> 15  ==  (x + 16) >> 5.
  29. pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
  30.  
      ; Byte coefficients for the SSSE3 filters (consumed by pmaddubsw).
      ; Three filter sets of 32 bytes each.  Within a set, the row at +0 is the
      ; pair applied to the two taps on either end of the 6-tap window and the
      ; row at +16 is the pair applied to the two centre taps.
  31. sixtap_filter_hb_m:  times 8 db   1, -5
  32.                      times 8 db  52, 20
  33.                      ; multiplied by 2 to have the same shift
  34.                      times 8 db   2, -10
  35.                      times 8 db  40,  40
  36.                      ; back to normal
  37.                      times 8 db   1, -5
  38.                      times 8 db  20, 52
  39.  
      ; Word coefficients for the MMX/SSE2 filters (consumed by pmullw).
      ; Three filter sets of 64 bytes each.  The rows at +0, +16, +32 and +48 of
      ; a set are read as COEFF05, COEFF14, COEFF2 and COEFF3 by FILTER_V and
      ; FILTER_H.
  40. sixtap_filter_v_m:   times 8 dw   1
  41.                      times 8 dw  -5
  42.                      times 8 dw  52
  43.                      times 8 dw  20
  44.                      ; multiplied by 2 to have the same shift
  45.                      times 8 dw   2
  46.                      times 8 dw -10
  47.                      times 8 dw  40
  48.                      times 8 dw  40
  49.                      ; back to normal
  50.                      times 8 dw   1
  51.                      times 8 dw  -5
  52.                      times 8 dw  20
  53.                      times 8 dw  52
  54.  
  55. %ifdef PIC
      ; Position-independent build: the table names expand to picregq, the extra
      ; register each function loads with the table address (lea) on entry.
      ; npicregs is added to the register count in the cglobal declarations.
  56. %define sixtap_filter_hw   picregq
  57. %define sixtap_filter_hb   picregq
  58. %define sixtap_filter_v    picregq
  59. %define npicregs 1
  60. %else
      ; Non-PIC build: the table names expand to the table symbols themselves.
      ; NOTE(review): no sixtap_filter_hw_m label and no use of sixtap_filter_hw
      ; is visible in this file - the _hw mapping looks unused; confirm.
  61. %define sixtap_filter_hw   sixtap_filter_hw_m
  62. %define sixtap_filter_hb   sixtap_filter_hb_m
  63. %define sixtap_filter_v    sixtap_filter_v_m
  64. %define npicregs 0
  65. %endif
  66.  
  67. filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
      ; pshufb masks for the SSSE3 horizontal filter, applied to 16 bytes loaded
      ; from src-2.  For output pixel x they build the byte pairs
      ;   shuf1: (src[x-2], src[x-1])
      ;   shuf2: (src[x  ], src[x+1])
      ;   shuf3: (src[x+3], src[x+2])   - reversed, so the coefficient row used
      ;                                   with shuf1 can be reused unchanged
      ; ready to be multiplied and pair-summed by pmaddubsw.
  68. filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
  69. filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  70.  
  71. cextern  pw_32
  72. cextern  pw_16
  73. cextern  pw_512
  74.  
  75. SECTION .text
  76.  
  77. ;-----------------------------------------------------------------------------
  78. ; subpel MC functions:
  79. ;
  80. ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  81. ;                                      uint8_t *src, int srcstride,
  82. ;                                      int len, int m);
  83. ;----------------------------------------------------------------------
  84. %macro LOAD  2
      ; LOAD reg, table
      ;   %1 = name stem of the register that holds the filter-table offset
      ;        (accessed as %1q and %1d)
      ;   %2 = table base; only used when PIC is not defined
      ; On exit %1q = table base + incoming value of %1, so it can be used
      ; directly as the address of the selected coefficient rows.
      ; NOTE(review): the incoming value is presumably a byte offset already
      ; scaled by the C caller to select a filter set - confirm against caller.
  85. %if WIN64
      ; sign-extend the 32-bit value before it is used in 64-bit address math
  86.    movsxd   %1q, %1d
  87. %endif
  88. %ifdef PIC
      ; picregq was loaded with the table address (lea) by the enclosing function
  89.    add      %1q, picregq
  90. %else
  91.    add      %1q, %2
  92. %endif
  93. %endmacro
  94.  
  95. %macro STORE 3
      ; STORE acc, tmp, op
      ;   %1 = register holding the filtered result as words
      ;   %2 = scratch register (only written when op is avg)
      ;   %3 = put or avg
      ; Packs %1 to bytes with unsigned saturation and writes it to [dstq].
      ; For avg, the packed bytes are first averaged with the bytes already
      ; present at [dstq].
  96. %ifidn %3, avg
      ; fetch the existing destination pixels for the average
  97.     movh      %2, [dstq]
  98. %endif
  99.     packuswb  %1, %1
  100. %ifidn %3, avg
  101. %if cpuflag(3dnow)
       ; the 3dnow build uses pavgusb instead of pavgb for the byte average
  102.     pavgusb   %1, %2
  103. %else
  104.     pavgb     %1, %2
  105. %endif
  106. %endif
  107.     movh  [dstq], %1
  108. %endmacro
  109.  
  110. %macro FILTER_V 1
       ; FILTER_V op   (op = put or avg)
       ; Emits the function <op>_rv40_qpel_v for the currently selected
       ; instruction set: a vertical 6-tap filter using word multiplies.
       ; Named arguments: dst, dststride, src, srcstride, height, my
       ; (+ picreg when PIC is defined).
       ; For each output row, with r0..r5 the six source rows starting two
       ; rows above the output row:
       ;   out = ((r0+r5)*COEFF05 + (r1+r4)*COEFF14 + r2*COEFF2 + r3*COEFF3
       ;          + [pw_32]) >> 6
       ; then STORE packs/saturates to bytes and writes (or averages into) dst.
       ; Register roles: m0..m4 = sliding window of five rows (as words),
       ; m5 = newly loaded row, m6 = accumulator, m7 = zero for unpacking.
  111. cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  112. %ifdef PIC
  113.     lea  picregq, [sixtap_filter_v_m]
  114. %endif
  115.     pxor      m7, m7
       ; myq becomes the address of the selected coefficient set
  116.     LOAD      my, sixtap_filter_v
  117.  
  118.     ; read 5 lines
       ; start two rows above src; afterwards srcq points one row below the
       ; original src, so [srcq+2*srcstrideq] in the loop is the 6th row
  119.     sub     srcq, srcstrideq
  120.     sub     srcq, srcstrideq
  121.     movh      m0, [srcq]
  122.     movh      m1, [srcq+srcstrideq]
  123.     movh      m2, [srcq+srcstrideq*2]
  124.     lea     srcq, [srcq+srcstrideq*2]
  125.     add     srcq, srcstrideq
  126.     movh      m3, [srcq]
  127.     movh      m4, [srcq+srcstrideq]
       ; widen the five rows from bytes to words
  128.     punpcklbw m0, m7
  129.     punpcklbw m1, m7
  130.     punpcklbw m2, m7
  131.     punpcklbw m3, m7
  132.     punpcklbw m4, m7
  133.  
       ; if register m8 is defined, keep the four coefficient rows in m8..m11;
       ; otherwise use them as memory operands
  134. %ifdef m8
  135.     mova      m8, [myq+ 0]
  136.     mova      m9, [myq+16]
  137.     mova     m10, [myq+32]
  138.     mova     m11, [myq+48]
  139. %define COEFF05  m8
  140. %define COEFF14  m9
  141. %define COEFF2   m10
  142. %define COEFF3   m11
  143. %else
  144. %define COEFF05  [myq+ 0]
  145. %define COEFF14  [myq+16]
  146. %define COEFF2   [myq+32]
  147. %define COEFF3   [myq+48]
  148. %endif
  149. .nextrow:
       ; the multiplies are interleaved with the window rotation
       ; (m0<-m1, m1<-m2, m2<-m3, m3<-m4, m4<-m5), so statement order matters
  150.     mova      m6, m1
  151.     movh      m5, [srcq+2*srcstrideq]      ; read new row
  152.     paddw     m6, m4
  153.     punpcklbw m5, m7
  154.     pmullw    m6, COEFF14
  155.     paddw     m0, m5
  156.     pmullw    m0, COEFF05
  157.     paddw     m6, m0
  158.     mova      m0, m1
       ; pw_32 is an external constant; presumably the rounding term for >> 6
  159.     paddw     m6, [pw_32]
  160.     mova      m1, m2
  161.     pmullw    m2, COEFF2
  162.     paddw     m6, m2
  163.     mova      m2, m3
  164.     pmullw    m3, COEFF3
  165.     paddw     m6, m3
  166.  
  167.     ; round/clip/store
  168.     mova      m3, m4
  169.     psraw     m6, 6
       ; m5 is copied into the window before STORE may reuse it as scratch
  170.     mova      m4, m5
  171.     STORE     m6, m5, %1
  172.  
  173.     ; go to next line
  174.     add     dstq, dststrideq
  175.     add     srcq, srcstrideq
  176.     dec  heightd                           ; next row
       ; loop while the remaining row count is still positive
  177.     jg .nextrow
  178.     REP_RET
  179. %endmacro
  180.  
  181. %macro FILTER_H  1
       ; FILTER_H op   (op = put or avg)
       ; Emits the function <op>_rv40_qpel_h for the currently selected
       ; instruction set: a horizontal 6-tap filter using word multiplies.
       ; Named arguments: dst, dststride, src, srcstride, height, mx
       ; (+ picreg when PIC is defined).
       ; For each output pixel x of a row:
       ;   out = ((s[x-2]+s[x+3])*COEFF05 + (s[x-1]+s[x+2])*COEFF14
       ;          + s[x]*COEFF2 + s[x+1]*COEFF3 + [pw_32]) >> 6
       ; then STORE packs/saturates to bytes and writes (or averages into) dst.
       ; Register roles: m0..m5 = the six shifted source loads (as words),
       ; m6 = [pw_32], m7 = zero for unpacking.
  182. cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  183. %ifdef PIC
       ; the horizontal filter uses the same word-coefficient table as FILTER_V
  184.     lea  picregq, [sixtap_filter_v_m]
  185. %endif
  186.     pxor      m7, m7
       ; mxq becomes the address of the selected coefficient set
  187.     LOAD      mx, sixtap_filter_v
       ; pw_32 is an external constant; presumably the rounding term for >> 6
  188.     mova      m6, [pw_32]
       ; if register m8 is defined, keep the four coefficient rows in m8..m11;
       ; otherwise use them as memory operands
  189. %ifdef m8
  190.     mova      m8, [mxq+ 0]
  191.     mova      m9, [mxq+16]
  192.     mova     m10, [mxq+32]
  193.     mova     m11, [mxq+48]
  194. %define COEFF05  m8
  195. %define COEFF14  m9
  196. %define COEFF2   m10
  197. %define COEFF3   m11
  198. %else
  199. %define COEFF05  [mxq+ 0]
  200. %define COEFF14  [mxq+16]
  201. %define COEFF2   [mxq+32]
  202. %define COEFF3   [mxq+48]
  203. %endif
  204. .nextrow:
       ; six loads of the same row at byte offsets -2..+3, widened to words
  205.     movq      m0, [srcq-2]
  206.     movq      m5, [srcq+3]
  207.     movq      m1, [srcq-1]
  208.     movq      m4, [srcq+2]
  209.     punpcklbw m0, m7
  210.     punpcklbw m5, m7
  211.     punpcklbw m1, m7
  212.     punpcklbw m4, m7
  213.     movq      m2, [srcq-0]
  214.     movq      m3, [srcq+1]
       ; taps that share a coefficient are summed before multiplying
  215.     paddw     m0, m5
  216.     paddw     m1, m4
  217.     punpcklbw m2, m7
  218.     punpcklbw m3, m7
  219.     pmullw    m0, COEFF05
  220.     pmullw    m1, COEFF14
  221.     pmullw    m2, COEFF2
  222.     pmullw    m3, COEFF3
       ; accumulate all products plus the m6 bias into m0, then shift down
  223.     paddw     m0, m6
  224.     paddw     m1, m2
  225.     paddw     m0, m3
  226.     paddw     m0, m1
  227.     psraw     m0, 6
       ; m1 is free again here and serves as STORE's scratch register
  228.     STORE     m0, m1, %1
  229.  
  230.     ; go to next line
  231.     add     dstq, dststrideq
  232.     add     srcq, srcstrideq
  233.     dec  heightd            ; next row
       ; loop while the remaining row count is still positive
  234.     jg .nextrow
  235.     REP_RET
  236. %endmacro
  237.  
  238. %if ARCH_X86_32
       ; The MMX-register variants are only assembled for 32-bit x86:
       ;   mmx    -> put_rv40_qpel_{v,h}
       ;   mmxext -> avg_rv40_qpel_{v,h}   (STORE averages with pavgb)
       ;   3dnow  -> avg_rv40_qpel_{v,h}   (STORE averages with pavgusb)
  239. INIT_MMX  mmx
  240. FILTER_V  put
  241. FILTER_H  put
  242.  
  243. INIT_MMX  mmxext
  244. FILTER_V  avg
  245. FILTER_H  avg
  246.  
  247. INIT_MMX  3dnow
  248. FILTER_V  avg
  249. FILTER_H  avg
  250. %endif
  251.  
       ; The SSE2 put and avg variants are assembled on every target
  252. INIT_XMM  sse2
  253. FILTER_H  put
  254. FILTER_H  avg
  255. FILTER_V  put
  256. FILTER_V  avg
  257.  
  258. %macro FILTER_SSSE3 1
       ; FILTER_SSSE3 op   (op = put or avg)
       ; Emits <op>_rv40_qpel_v and <op>_rv40_qpel_h built on pmaddubsw with
       ; the byte-coefficient table sixtap_filter_hb_m.  Each pmaddubsw
       ; multiplies interleaved source byte pairs by a coefficient pair and sums
       ; the two products, so three of them cover the six taps.
       ; The final pmulhrsw with the external constant pw_512 computes
       ; (x * c + 0x4000) >> 15; presumably c = 512, i.e. (x + 32) >> 6.
       ;
       ; Vertical filter.  Register roles: m0..m4 = sliding window of five rows
       ; (as bytes), m5 = coefficient row at +0, m6 = accumulator, m7 = new row.
  259. cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  260. %ifdef PIC
  261.     lea  picregq, [sixtap_filter_hb_m]
  262. %endif
  263.  
  264.     ; read 5 lines
       ; start two rows above src; after the last lea srcq points at the 6th row
  265.     sub     srcq, srcstrideq
  266.     LOAD      my, sixtap_filter_hb
  267.     sub     srcq, srcstrideq
  268.     movh      m0, [srcq]
  269.     movh      m1, [srcq+srcstrideq]
  270.     movh      m2, [srcq+srcstrideq*2]
  271.     lea     srcq, [srcq+srcstrideq*2]
  272.     add     srcq, srcstrideq
  273.     mova      m5, [myq]
  274.     movh      m3, [srcq]
  275.     movh      m4, [srcq+srcstrideq]
  276.     lea     srcq, [srcq+2*srcstrideq]
  277.  
  278. .nextrow:
       ; m0 = rows 0/1 interleaved * [myq], m6 = rows 2/3 interleaved * [myq+16]
  279.     mova      m6, m2
  280.     punpcklbw m0, m1
  281.     punpcklbw m6, m3
  282.     pmaddubsw m0, m5
  283.     pmaddubsw m6, [myq+16]
  284.     movh      m7, [srcq]      ; read new row
  285.     paddw     m6, m0
       ; rotate the window: m0<-m1, m1<-m2, m2<-m3, m3<-m4, m4<-new row
  286.     mova      m0, m1
  287.     mova      m1, m2
  288.     mova      m2, m3
  289.     mova      m3, m4
  290.     mova      m4, m7
       ; new row paired with the row above it (now in m3): the pair is in
       ; reversed order, so the same coefficient row m5 applies
  291.     punpcklbw m7, m3
  292.     pmaddubsw m7, m5
  293.     paddw     m6, m7
  294.     pmulhrsw  m6, [pw_512]
  295.     STORE     m6, m7, %1
  296.  
  297.     ; go to next line
  298.     add     dstq, dststrideq
  299.     add     srcq, srcstrideq
  300.     dec       heightd                          ; next row
  301.     jg       .nextrow
  302.     REP_RET
  303.  
       ; Horizontal filter.  Register roles: m7/m3/m4 = pshufb masks
       ; filter_h6_shuf1/2/3, m5/m6 = coefficient rows at +0/+16,
       ; m0..m2 = working copies of the 16 source bytes loaded from src-2.
  304. cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  305. %ifdef PIC
  306.     lea  picregq, [sixtap_filter_hb_m]
  307. %endif
  308.     mova      m3, [filter_h6_shuf2]
  309.     mova      m4, [filter_h6_shuf3]
  310.     LOAD      mx, sixtap_filter_hb
  311.     mova      m5, [mxq] ; set up 6tap filter in bytes
  312.     mova      m6, [mxq+16]
  313.     mova      m7, [filter_h6_shuf1]
  314.  
  315. .nextrow:
       ; unaligned 16-byte load, then three shuffles build the tap pairs
  316.     movu      m0, [srcq-2]
  317.     mova      m1, m0
  318.     mova      m2, m0
  319.     pshufb    m0, m7
  320.     pshufb    m1, m3
  321.     pshufb    m2, m4
       ; shuf1 and shuf3 pairs use row +0, the shuf2 (centre) pair uses row +16
  322.     pmaddubsw m0, m5
  323.     pmaddubsw m1, m6
  324.     pmaddubsw m2, m5
  325.     paddw     m0, m1
  326.     paddw     m0, m2
  327.     pmulhrsw  m0, [pw_512]
  328.     STORE     m0, m1, %1
  329.  
  330.     ; go to next line
  331.     add     dstq, dststrideq
  332.     add     srcq, srcstrideq
  333.     dec  heightd            ; next row
  334.     jg .nextrow
  335.     REP_RET
  336. %endmacro
  337.  
  338. INIT_XMM ssse3
       ; each FILTER_SSSE3 line emits both the _v and the _h function
  339. FILTER_SSSE3  put
  340. FILTER_SSSE3  avg
  341.  
  342. ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
  343. %macro RV40_WCORE  4-5
       ; Weighted blend of one vector's worth of pixels from two sources.
       ;   %1 = 0 selects the 14-bit weight path, non-zero the 5-bit path
       ;   %2 = dst base, %3 = first source base, %4 = second source base
       ;   %5 = optional; when given, the second half comes from the next row
       ;        (r6 + r5) instead of from the same row
       ; Expects the setup done by RV40_WEIGHT: r6 = running offset,
       ; r5 = stride, m0 = zero, m1 = bias/rounding constant, m2/m3 = weights.
       ; The first source is weighted by m3 and the second by m2.
       ; Uses m4..m7 as temporaries.  When %5 is given, r6 is left advanced by
       ; one row.
  344.     movh       m4, [%3 + r6 + 0]
  345.     movh       m5, [%4 + r6 + 0]
  346. %if %0 == 4
       ; no stride argument: second half sits right after the first in the row
  347. %define OFFSET r6 + mmsize / 2
  348. %else
  349.     ; 8x8 block and sse2, stride was provided
  350. %define OFFSET r6
  351.     add        r6, r5
  352. %endif
  353.     movh       m6, [%3 + OFFSET]
  354.     movh       m7, [%4 + OFFSET]
  355.  
  356. %if %1 == 0
  357.     ; 14bits weights
       ; widen to words, pre-shift by 7, keep the high 16 bits of each product
  358.     punpcklbw  m4, m0
  359.     punpcklbw  m5, m0
  360.     punpcklbw  m6, m0
  361.     punpcklbw  m7, m0
  362.  
  363.     psllw      m4, 7
  364.     psllw      m5, 7
  365.     psllw      m6, 7
  366.     psllw      m7, 7
  367.     pmulhw     m4, m3
  368.     pmulhw     m5, m2
  369.     pmulhw     m6, m3
  370.     pmulhw     m7, m2
  371.  
  372.     paddw      m4, m5
  373.     paddw      m6, m7
  374. %else
  375.     ; 5bits weights
  376. %if cpuflag(ssse3)
       ; interleave both sources byte-wise; m3 holds the matching interleaved
       ; weight pair, so one pmaddubsw yields the sum of both weighted pixels
  377.     punpcklbw  m4, m5
  378.     punpcklbw  m6, m7
  379.  
  380.     pmaddubsw  m4, m3
  381.     pmaddubsw  m6, m3
  382. %else
  383.     punpcklbw  m4, m0
  384.     punpcklbw  m5, m0
  385.     punpcklbw  m6, m0
  386.     punpcklbw  m7, m0
  387.  
  388.     pmullw     m4, m3
  389.     pmullw     m5, m2
  390.     pmullw     m6, m3
  391.     pmullw     m7, m2
  392.     paddw      m4, m5
  393.     paddw      m6, m7
  394. %endif
  395.  
  396. %endif
  397.  
  398.     ; bias and shift down
  399. %if cpuflag(ssse3)
       ; m1 = pw_1024 here, so pmulhrsw computes (x + 16) >> 5 in one step
  400.     pmulhrsw   m4, m1
  401.     pmulhrsw   m6, m1
  402. %else
       ; m1 = pw_16 here (external constant, presumably words of 16)
  403.     paddw      m4, m1
  404.     paddw      m6, m1
  405.     psrlw      m4, 5
  406.     psrlw      m6, 5
  407. %endif
  408.  
       ; pack both halves to bytes with unsigned saturation
  409.     packuswb   m4, m6
  410. %if %0 == 5
  411.     ; Only called for 8x8 blocks and sse2
       ; low half goes to the previous row, high half to the current one
  412.     sub        r6, r5
  413.     movh       [%2 + r6], m4
  414.     add        r6, r5
  415.     movhps     [%2 + r6], m4
  416. %else
  417.     mova       [%2 + r6], m4
  418. %endif
  419. %endmacro
  420.  
  421.  
  422. %macro MAIN_LOOP   2
       ; One iteration of the weight loop.
       ;   %1 = block size (8 or 16), %2 = weight-path selector passed on to
       ;        RV40_WCORE as its %1
       ; Every branch ends with "add r6, r5"; the jnz that follows the macro in
       ; RV40_WEIGHT tests the flags of that add, so it must stay last.
  423. %if mmsize == 8
       ; MMX registers: one RV40_WCORE per 8 pixels of the row
  424.     RV40_WCORE %2, r0, r1, r2
  425. %if %1 == 16
  426.     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  427. %endif
  428.  
  429.     ; Prepare for next loop
  430.     add        r6, r5
  431. %else
  432. %ifidn %1, 8
       ; XMM registers, size 8: passing the stride makes RV40_WCORE process two
       ; rows and advance r6 by one row itself
  433.     RV40_WCORE %2, r0, r1, r2, r5
  434.     ; Prepare 2 next lines
  435.     add        r6, r5
  436. %else
  437.     RV40_WCORE %2, r0, r1, r2
  438.     ; Prepare single next line
  439.     add        r6, r5
  440. %endif
  441. %endif
  442.  
  443. %endmacro
  444.  
  445. ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  446. ; %1=rnd or nornd  %2=size  %3=log2(size), used to compute size * stride
  447. ; The weights are FP0.14 notation of fractions depending on pts.
  448. ; For timebases without rounding error (i.e. PAL), the fractions
  449. ; can be simplified, and several operations can be avoided.
  450. ; Therefore, we check here whether they are multiples of 2^9 for
  451. ; those simplifications to occur.
  452. %macro RV40_WEIGHT  3
       ; RV40_WEIGHT variant, size, shift
       ;   %1 = rnd or nornd (part of the function name; selects the weight path)
       ;   %2 = block size (part of the function name; forwarded to MAIN_LOOP)
       ;   %3 = left-shift applied to the stride to get the loop span
       ;        (3 for size 8, 4 for size 16 in the instantiations of this file)
       ; Register roles: r0 = dst, r1/r2 = the two sources, r3d/r4d = the two
       ; weights, r5 = stride, r6 = negative running offset counting up to 0.
       ; m0 = zero, m1 = bias constant, m2 = weight from r3d, m3 = weight from r4d.
  453. cglobal rv40_weight_func_%1_%2, 6, 7, 8
  454. %if cpuflag(ssse3)
       ; multiplier for pmulhrsw, see RV40_WCORE
  455.     mova       m1, [pw_1024]
  456. %else
  457.     mova       m1, [pw_16]
  458. %endif
  459.     pxor       m0, m0
  460.     ; Set loop counter and increments
       ; r6 = stride << %3; the pointers are moved to the end of the block and
       ; r6 is negated so that [ptr + r6] walks forward until r6 reaches zero
  461.     mov        r6, r5
  462.     shl        r6, %3
  463.     add        r0, r6
  464.     add        r1, r6
  465.     add        r2, r6
  466.     neg        r6
  467.  
  468.     movd       m2, r3d
  469.     movd       m3, r4d
  470. %ifidn %1,rnd
       ; rnd variant: RND = 0 selects the 14-bit weight path of RV40_WCORE
  471. %define  RND   0
  472.     SPLATW     m2, m2
  473. %else
       ; nornd variant: RND = 1 selects the 5-bit weight path of RV40_WCORE
  474. %define  RND   1
  475. %if cpuflag(ssse3)
       ; build the byte pair (r4d weight, r3d weight) for pmaddubsw
  476.     punpcklbw  m3, m2
  477. %else
  478.     SPLATW     m2, m2
  479. %endif
  480. %endif
  481.     SPLATW     m3, m3
  482.  
  483. .loop:
  484.     MAIN_LOOP  %2, RND
       ; flags come from the final "add r6, r5" inside MAIN_LOOP
  485.     jnz        .loop
  486.     REP_RET
  487. %endmacro
  488.  
  489. INIT_MMX mmxext
       ; For each instruction set, emit rv40_weight_func_{rnd,nornd}_{8,16}.
       ; The third argument is the stride shift: 3 for size 8, 4 for size 16.
  490. RV40_WEIGHT   rnd,    8, 3
  491. RV40_WEIGHT   rnd,   16, 4
  492. RV40_WEIGHT   nornd,  8, 3
  493. RV40_WEIGHT   nornd, 16, 4
  494.  
  495. INIT_XMM sse2
  496. RV40_WEIGHT   rnd,    8, 3
  497. RV40_WEIGHT   rnd,   16, 4
  498. RV40_WEIGHT   nornd,  8, 3
  499. RV40_WEIGHT   nornd, 16, 4
  500.  
  501. INIT_XMM ssse3
  502. RV40_WEIGHT   rnd,    8, 3
  503. RV40_WEIGHT   rnd,   16, 4
  504. RV40_WEIGHT   nornd,  8, 3
  505. RV40_WEIGHT   nornd, 16, 4
  506.