Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright (c) 2002 Brian Foley
  3.  * Copyright (c) 2002 Dieter Shirley
  4.  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  5.  *
  6.  * This file is part of FFmpeg.
  7.  *
  8.  * FFmpeg is free software; you can redistribute it and/or
  9.  * modify it under the terms of the GNU Lesser General Public
  10.  * License as published by the Free Software Foundation; either
  11.  * version 2.1 of the License, or (at your option) any later version.
  12.  *
  13.  * FFmpeg is distributed in the hope that it will be useful,
  14.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16.  * Lesser General Public License for more details.
  17.  *
  18.  * You should have received a copy of the GNU Lesser General Public
  19.  * License along with FFmpeg; if not, write to the Free Software
  20.  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21.  */
  22.  
  23. #include "config.h"
  24. #if HAVE_ALTIVEC_H
  25. #include <altivec.h>
  26. #endif
  27. #include <string.h>
  28.  
  29. #include "libavutil/attributes.h"
  30. #include "libavutil/cpu.h"
  31. #include "libavutil/mem.h"
  32. #include "libavutil/ppc/cpu.h"
  33. #include "libavutil/ppc/types_altivec.h"
  34. #include "libavcodec/blockdsp.h"
  35.  
  36. /* ***** WARNING ***** WARNING ***** WARNING ***** */
  37. /*
  38.  * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
  39.  * a cache line size not equal to 32 bytes. Fortunately all processors used
  40.  * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
  41.  * cache lines. This is due to the use of the 'dcbz' instruction. It simply
  42.  * clears a single cache line to zero, so you need to know the cache line
  43.  * size to use it! It's absurd, but it's fast...
  44.  *
  45.  * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
  46.  * cache line size: 128 bytes. Oups.
  47.  * The semantics of dcbz was changed, it always clears 32 bytes. So the function
  48.  * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
  49.  * which is defined to clear a cache line (as dcbz before). So we can still
  50.  * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
  51.  *
  52.  * see <http://developer.apple.com/technotes/tn/tn2087.html>
  53.  * and <http://developer.apple.com/technotes/tn/tn2086.html>
  54.  */
  55. static void clear_blocks_dcbz32_ppc(int16_t *blocks)
  56. {
  57.     register int misal = (unsigned long) blocks & 0x00000010, i = 0;
  58.  
  59.     if (misal) {
  60.         ((unsigned long *) blocks)[0] = 0L;
  61.         ((unsigned long *) blocks)[1] = 0L;
  62.         ((unsigned long *) blocks)[2] = 0L;
  63.         ((unsigned long *) blocks)[3] = 0L;
  64.         i += 16;
  65.     }
  66.     for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32)
  67.         __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory");
  68.     if (misal) {
  69.         ((unsigned long *) blocks)[188] = 0L;
  70.         ((unsigned long *) blocks)[189] = 0L;
  71.         ((unsigned long *) blocks)[190] = 0L;
  72.         ((unsigned long *) blocks)[191] = 0L;
  73.         i += 16;
  74.     }
  75. }
  76.  
  77. /* Same as above, when dcbzl clears a whole 128 bytes cache line
  78.  * i.e. the PPC970 AKA G5. */
  79. static void clear_blocks_dcbz128_ppc(int16_t *blocks)
  80. {
  81. #if HAVE_DCBZL
  82.     register int misal = (unsigned long) blocks & 0x0000007f, i = 0;
  83.  
  84.     if (misal) {
  85.         /* We could probably also optimize this case,
  86.          * but there's not much point as the machines
  87.          * aren't available yet (2003-06-26). */
  88.         memset(blocks, 0, sizeof(int16_t) * 6 * 64);
  89.     } else {
  90.         for (; i < sizeof(int16_t) * 6 * 64; i += 128)
  91.             __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory");
  92.     }
  93. #else
  94.     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
  95. #endif
  96. }
  97.  
  98. /* Check dcbz report how many bytes are set to 0 by dcbz. */
  99. /* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
  100.  * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
  101.  * assembler knows about dcbzl ... */
  102. static long check_dcbzl_effect(void)
  103. {
  104.     long count = 0;
  105. #if HAVE_DCBZL
  106.     register char *fakedata = av_malloc(1024);
  107.     register char *fakedata_middle;
  108.     register long zero = 0, i = 0;
  109.  
  110.     if (!fakedata)
  111.         return 0L;
  112.  
  113.     fakedata_middle = fakedata + 512;
  114.  
  115.     memset(fakedata, 0xFF, 1024);
  116.  
  117.     /* Below the constraint "b" seems to mean "address base register"
  118.      * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */
  119.     __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero));
  120.  
  121.     for (i = 0; i < 1024; i++)
  122.         if (fakedata[i] == (char) 0)
  123.             count++;
  124.  
  125.     av_free(fakedata);
  126. #endif
  127.  
  128.     return count;
  129. }
  130.  
  131. #if HAVE_ALTIVEC
  132. static void clear_block_altivec(int16_t *block)
  133. {
  134.     LOAD_ZERO;
  135.     vec_st(zero_s16v,   0, block);
  136.     vec_st(zero_s16v,  16, block);
  137.     vec_st(zero_s16v,  32, block);
  138.     vec_st(zero_s16v,  48, block);
  139.     vec_st(zero_s16v,  64, block);
  140.     vec_st(zero_s16v,  80, block);
  141.     vec_st(zero_s16v,  96, block);
  142.     vec_st(zero_s16v, 112, block);
  143. }
  144. #endif /* HAVE_ALTIVEC */
  145.  
  146. av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth)
  147. {
  148.     // common optimizations whether AltiVec is available or not
  149.     if (!high_bit_depth) {
  150.         switch (check_dcbzl_effect()) {
  151.         case 32:
  152.             c->clear_blocks = clear_blocks_dcbz32_ppc;
  153.             break;
  154.         case 128:
  155.             c->clear_blocks = clear_blocks_dcbz128_ppc;
  156.             break;
  157.         default:
  158.             break;
  159.         }
  160.     }
  161.  
  162. #if HAVE_ALTIVEC
  163.     if (!PPC_ALTIVEC(av_get_cpu_flags()))
  164.         return;
  165.  
  166.     if (!high_bit_depth)
  167.         c->clear_block = clear_block_altivec;
  168. #endif /* HAVE_ALTIVEC */
  169. }
  170.