/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"

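/**
 * Sum of absolute differences for a 16xh block.
 * pix2 is compared at a half-pixel horizontal offset, i.e. against the
 * rounded average of pix2[x] and pix2[x+1].
 */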
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

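/**
 * Sum of absolute differences for a 16xh block.
 * pix2 is compared at a half-pixel vertical offset, i.e. against the
 * rounded average of pix2[x] and pix2[x+line_size].
 */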
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Since pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read the unaligned pixels into our vector:
       pix2v: pix2[0]-pix2[15] */
    pix2l = vec_ld( 0, pix2);
    pix2r = vec_ld(15, pix2);
    pix2v = vec_perm(pix2l, pix2r, perm);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        pix1v = vec_ld(0, pix1);

        pix2l = vec_ld( 0, pix3);
        pix2r = vec_ld(15, pix3);
        pix3v = vec_perm(pix2l, pix2r, perm);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

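/**
 * Sum of absolute differences for a 16xh block.
 * pix2 is compared at a half-pixel offset in both directions, i.e. against
 * the correctly rounded average of four neighbouring pixels.
 */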
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Since pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting
       and vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    pix2l  = vec_ld( 0, pix2);
    pix2r  = vec_ld(16, pix2);
    pix2v  = vec_perm(pix2l, pix2r, perm1);
    pix2iv = vec_perm(pix2l, pix2r, perm2);

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        pix1v = vec_ld(0, pix1);

        pix2l  = vec_ld( 0, pix3);
        pix2r  = vec_ld(16, pix3);
        pix3v  = vec_perm(pix2l, pix2r, perm1);
        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts
           and do the averaging by hand. */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

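/**
 * Plain sum of absolute differences for a 16xh block.
 */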
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

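/**
 * Plain sum of absolute differences for an 8xh block.
 */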
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

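/**
 * Sum of squares of all pixels in a 16x16 block.
 */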
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * This is the sad8_altivec code above with squaring added.
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * This is the sad16_altivec code above with squaring added.
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

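/**
 * Sum of all pixels in a 16x16 block.
 */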
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

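/**
 * Copy an 8x8 block of unsigned bytes into a 16-byte-aligned block of
 * signed 16-bit values.
 */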
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm = vec_lvsl(0, pixels);
    vector unsigned char bytes;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        vector unsigned char pixl = vec_ld( 0, pixels);
        vector unsigned char pixr = vec_ld(15, pixels);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

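/**
 * Compute the element-wise difference s1 - s2 of two 8x8 byte blocks and
 * store it as signed 16-bit values in a 16-byte-aligned block.
 */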
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
        const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm1 = vec_lvsl(0, s1);
    vector unsigned char perm2 = vec_lvsl(0, s2);
    vector unsigned char bytes, pixl, pixr;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        pixl = vec_ld( 0, s1);
        pixr = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl = vec_ld( 0, s2);
        pixr = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        pixl = vec_ld( 0, s1);
        pixr = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl = vec_ld( 0, s2);
        pixr = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

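/**
 * Zero a 16-byte-aligned block of 64 signed 16-bit values.
 */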
static void clear_block_altivec(int16_t *block) {
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
}

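/**
 * Add the first w bytes of src into dst; both pointers are guaranteed to be
 * 16-byte aligned.
 */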
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* Scalar tail if w is not a multiple of 16; add (not copy) the
       remaining bytes so the tail matches the vector loop above. */
    for (; i < w; i++) {
        dst[i] += src[i];
    }
}

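/**
 * Sum of absolute values of the Hadamard-transformed difference (SATD)
 * between an 8x8 area of src and dst.
 */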
static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int sum;
    register const vector unsigned char vzero =
                            (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
    register const vector signed short vprod1 =(const vector signed short)
                                               { 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 =(const vector signed short)
                                               { 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 =(const vector signed short)
                                               { 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 = (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 = (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 = (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res)                                          \
    {                                                                     \
    register vector unsigned char src1, src2, srcO;                   \
    register vector unsigned char dst1, dst2, dstO;                   \
    register vector signed short srcV, dstV;                          \
    register vector signed short but0, but1, but2, op1, op2, op3;     \
    src1 = vec_ld(stride * i, src);                                   \
    src2 = vec_ld((stride * i) + 15, src);                            \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
    dst1 = vec_ld(stride * i, dst);                                   \
    dst2 = vec_ld((stride * i) + 15, dst);                            \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
    /* promote the unsigned chars to signed shorts */                 \
    /* we're in the 8x8 function, so we only care about the first 8 */\
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    /* subtractions inside the first butterfly */                     \
    but0 = vec_sub(srcV, dstV);                                       \
    op1  = vec_perm(but0, but0, perm1);                               \
    but1 = vec_mladd(but0, vprod1, op1);                              \
    op2  = vec_perm(but1, but1, perm2);                               \
    but2 = vec_mladd(but1, vprod2, op2);                              \
    op3  = vec_perm(but2, but2, perm3);                               \
    res  = vec_mladd(but2, vprod3, op3);                              \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
16x8 works with 16 elements; it allows us to avoid replicating loads and
gives the compiler more room for scheduling. It is only used from
inside hadamard8_diff16_altivec.

Unfortunately, gcc-3.3 is a bit dumb, and the compiled code has a LOT
of spill code; it seems gcc (unlike xlc) cannot keep everything in registers
by itself. The following code includes hand-made register allocation. It is
not clean, but on a 7450 the resulting code is much faster (the best case
falls from 700+ cycles to 550).

xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
and its code isn't much faster than gcc-3.3's on the 7450 (though it uses 25%
fewer instructions...)

On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
xlc gets to around 660 on the regular C code...
*/

static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char)vec_splat_u8(0);
    {
    register const vector signed short vprod1 __asm__ ("v16") =
        (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 __asm__ ("v17") =
        (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 __asm__ ("v18") =
        (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 __asm__ ("v19") =
        (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 __asm__ ("v20") =
        (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 __asm__ ("v21") =
        (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res1, res2)                               \
    {                                                                 \
    register vector unsigned char src1 __asm__ ("v22"),               \
                                  src2 __asm__ ("v23"),               \
                                  dst1 __asm__ ("v24"),               \
                                  dst2 __asm__ ("v25"),               \
                                  srcO __asm__ ("v22"),               \
                                  dstO __asm__ ("v23");               \
                                                                      \
    register vector signed short  srcV  __asm__ ("v24"),              \
                                  dstV  __asm__ ("v25"),              \
                                  srcW  __asm__ ("v26"),              \
                                  dstW  __asm__ ("v27"),              \
                                  but0  __asm__ ("v28"),              \
                                  but0S __asm__ ("v29"),              \
                                  op1   __asm__ ("v30"),              \
                                  but1  __asm__ ("v22"),              \
                                  op1S  __asm__ ("v23"),              \
                                  but1S __asm__ ("v24"),              \
                                  op2   __asm__ ("v25"),              \
                                  but2  __asm__ ("v26"),              \
                                  op2S  __asm__ ("v27"),              \
                                  but2S __asm__ ("v28"),              \
                                  op3   __asm__ ("v29"),              \
                                  op3S  __asm__ ("v30");              \
                                                                      \
    src1 = vec_ld(stride * i, src);                                   \
    src2 = vec_ld((stride * i) + 16, src);                            \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
    dst1 = vec_ld(stride * i, dst);                                   \
    dst2 = vec_ld((stride * i) + 16, dst);                            \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
    /* promote the unsigned chars to signed shorts */                 \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    /* subtractions inside the first butterfly */                     \
    but0 = vec_sub(srcV, dstV);                                       \
    but0S = vec_sub(srcW, dstW);                                      \
    op1 = vec_perm(but0, but0, perm1);                                \
    but1 = vec_mladd(but0, vprod1, op1);                              \
    op1S = vec_perm(but0S, but0S, perm1);                             \
    but1S = vec_mladd(but0S, vprod1, op1S);                           \
    op2 = vec_perm(but1, but1, perm2);                                \
    but2 = vec_mladd(but1, vprod2, op2);                              \
    op2S = vec_perm(but1S, but1S, perm2);                             \
    but2S = vec_mladd(but1S, vprod2, op2S);                           \
    op3 = vec_perm(but2, but2, perm3);                                \
    res1 = vec_mladd(but2, vprod3, op3);                              \
    op3S = vec_perm(but2S, but2S, perm3);                             \
    res2 = vec_mladd(but2S, vprod3, op3S);                            \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS,line2BS,
                                 line1BS,line3BS,line4BS,line6BS,line5BS,
                                 line7BS,line0CS,line4CS,line1CS,line5CS,
                                 line2CS,line6CS,line3CS,line7CS;

    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);

    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);

    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

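/**
 * SATD of a 16x8 or 16x16 area, computed as one or two passes of the
 * 16x8 routine above.
 */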
static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int score;
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}

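/**
 * Install the AltiVec implementations above into the DSPContext function
 * tables.
 */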
av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1] = sse8_altivec;
    c->sse[0] = sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->add_bytes = add_bytes_altivec;
    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
        c->clear_block = clear_block_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
}