/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_x86.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride);
int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);

#if HAVE_INLINE_ASM

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
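
/* For reference, the loop above computes a plain sum of squared differences
 * over an 8-pixel-wide block.  A scalar sketch of the same computation,
 * added here purely for illustration (it is not built or used by the
 * dispatch below, and the name sse8_c_ref is made up): */
#if 0
static int sse8_c_ref(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int d = pix1[x] - pix2[x]; /* signed pixel difference */
            sum  += d * d;             /* accumulate squared error */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
#endif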

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

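/* hf_noise8/hf_noise16 estimate the high-frequency content of a block: they
 * accumulate, over pairs of adjacent rows, the absolute change of the
 * horizontal gradient (pix[x] - pix[x+1]) from one row to the next.  The
 * nsse* functions below use the difference of this measure between the two
 * blocks as a noise term.  A scalar sketch of the 8-wide variant
 * (illustrative only, not compiled; the name and the exact handling of the
 * block edge columns are approximations): */
#if 0
static int hf_noise8_ref(const uint8_t *pix, int line_size, int h)
{
    int score = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 7; x++) {
            int d0 = pix[x]             - pix[x + 1];             /* gradient, row y   */
            int d1 = pix[x + line_size] - pix[x + 1 + line_size]; /* gradient, row y+1 */
            score += FFABS(d0 - d1);
        }
        pix += line_size;
    }
    return score;
}
#endif
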
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

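/* Noise preserving sum of squared differences: the plain SSE score plus a
 * weighted penalty for how much high-frequency energy (as measured by the
 * hf_noise* helpers above) differs between the two blocks, so that blurring
 * detail away is not "free" for the encoder. */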
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    av_assert2( (((int)pix) & 7) == 0);
    av_assert2((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
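
/* The vsad_intra16 kernels sum, per 16-pixel row, the absolute difference to
 * the row above.  Roughly the following scalar sketch (illustrative only,
 * not compiled; vsad_intra16_ref is a made-up name): */
#if 0
static int vsad_intra16_ref(const uint8_t *pix, int line_size, int h)
{
    int score = 0;
    for (int y = 1; y < h; y++) {
        pix += line_size;
        for (int x = 0; x < 16; x++)
            score += FFABS(pix[x] - pix[x - line_size]); /* vertical gradient */
    }
    return score & 0xFFFF; /* the MMX version accumulates in 16-bit lanes */
}
#endif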

static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2( (((int)pix) & 7) == 0);
    av_assert2((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    av_assert2( (((int)pix1) & 7) == 0);
    av_assert2( (((int)pix2) & 7) == 0);
    av_assert2((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
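
/* vsad16 measures the vertical activity of the residual pix1 - pix2: per
 * column it sums |(pix1 - pix2) - (pix1_above - pix2_above)|.  A scalar
 * sketch of the idea (illustrative only, not compiled, and ignoring the
 * byte-wrapping of the MMX residual; vsad16_ref is a made-up name): */
#if 0
static int vsad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int h)
{
    int score = 0;
    for (int y = 1; y < h; y++) {
        pix1 += line_size;
        pix2 += line_size;
        for (int x = 0; x < 16; x++)
            score += FFABS((pix1[x] - pix2[x]) -
                           (pix1[x - line_size] - pix2[x - line_size]));
    }
    return score;
}
#endif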

static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2( (((int)pix1) & 7) == 0);
    av_assert2( (((int)pix2) & 7) == 0);
    av_assert2((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

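/* diff_bytes: dst[i] = src1[i] - src2[i] (modulo 256), 16 bytes per MMX
 * iteration, with a scalar loop for the remaining tail bytes. */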
static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    x86_reg i=0;
    if(w>=16)
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
                                              const uint8_t *src2, int w,
                                              int *left, int *left_top)
{
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "movq  (%1, %0), %%mm0          \n\t" // LT
        "psllq $8, %%mm0                \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "movq -1(%1, %0), %%mm0         \n\t" // LT
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
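
/* The median predictor used above: for each output byte the prediction is
 * the median of L (left), T (top) and L + T - LT (gradient), and the stored
 * value is the source minus that prediction.  A per-byte scalar sketch of
 * the same idea (illustrative only, not compiled; the name is made up): */
#if 0
static void sub_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int w,
                                           int *left, int *left_top)
{
    int l  = *left;     /* running L  */
    int lt = *left_top; /* running LT */
    for (int i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt     = src1[i];
        l      = src2[i];
        dst[i] = l - pred;
    }
    *left     = l;
    *left_top = lt;
}
#endif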

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMXEXT(a, z)                 \
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMXEXT(a, t, dst)             \
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(int16_t *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}
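
/* The DCT_SAD kernels compute the sum of |coefficient| over one 8x8 block of
 * int16_t DCT coefficients (modulo the 16-bit saturation noted in the FIXME
 * above).  Scalar sketch, illustrative only and not compiled: */
#if 0
static int sum_abs_dctelem_ref(int16_t *block)
{
    int sum = 0;
    for (int i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
#endif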

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
#define MMABS(a,z)    MMABS_MMXEXT(a,z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
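
/* ssd_int8_vs_int16 is the sum of squared differences between an int8_t
 * vector and an int16_t vector of the same length.  Scalar sketch,
 * illustrative only and not compiled: */
#if 0
static int ssd_int8_vs_int16_ref(const int8_t *pix1, const int16_t *pix2,
                                 int size)
{
    int sum = 0;
    for (int i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
#endif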

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

#if HAVE_YASM
    int bit_depth = avctx->bits_per_raw_sample;

    if (EXTERNAL_MMX(cpu_flags)) {
        if (bit_depth <= 8)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum = ff_pix_sum16_mmx;

        c->pix_norm1 = ff_pix_norm1_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags))
        if (bit_depth <= 8)
            c->get_pixels = ff_get_pixels_sse2;
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmxext;

        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
        c->vsad[4]         = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmxext;
        }

        c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
    }

    if (INLINE_SSE2(cpu_flags)) {
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_sse2;

        c->sum_abs_dctelem= sum_abs_dctelem_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_ssse3;
        }
        c->add_8x8basis    = add_8x8basis_ssse3;
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
    }
#endif

    if (INLINE_AMD3DNOW(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_3dnow;
        }
        c->add_8x8basis = add_8x8basis_3dnow;
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}
  1062.