/*
 * Copyright (c) 2004 Romain Dolbeau
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>   /* for ASSERT_ALIGNED() in DEBUG builds */

#include "libavutil/mem.h"

#ifdef DEBUG
/* abort (in DEBUG builds) if ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
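
/*
 * Usage note (editor's sketch, not part of the original file): this is a
 * template that is parameterized by the including file, which must define
 * the PREFIX_* function names and the OP_U8_ALTIVEC store macro before
 * inclusion.  A minimal, hypothetical instantiation for the "put" and "avg"
 * variants might look like the following; the macro and file names used here
 * are assumptions, not taken from this repository:
 *
 *   #define OP_U8_ALTIVEC(d, s, dst) d = s                  // put: overwrite dst
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
 *   #include "h264_template_altivec.c"
 *   #undef OP_U8_ALTIVEC
 *   #undef PREFIX_h264_qpel16_h_lowpass_altivec
 *
 *   #define OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)    // avg: average with dst
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
 *   #include "h264_template_altivec.c"
 *   #undef OP_U8_ALTIVEC
 *   #undef PREFIX_h264_qpel16_h_lowpass_altivec
 */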

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* realign the unaligned source: depending on (src - 2) % 16, some of
         * the six byte-shifted vectors coincide with the second 16-byte load
         * or require a third load */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* zero-extend the 16 pixels of each tap to 16 bits
         * (A: pixels 0-7, B: pixels 8-15) */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* 6-tap filter: 20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16, then >> 5 */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
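
/*
 * For reference, a scalar sketch (editor's addition, hypothetical helper
 * name, assuming 8-bit samples) of the six-tap luma half-pel filter that the
 * vector loop above implements, with the same +16 rounding and >>5 scaling:
 *
 *   static inline uint8_t h264_lowpass_pixel(const uint8_t *p)
 *   {
 *       int v = (p[-2] + p[3]) - 5 * (p[-1] + p[2]) + 20 * (p[0] + p[1]);
 *       v = (v + 16) >> 5;
 *       return v < 0 ? 0 : (v > 255 ? 255 : v);   // unsigned saturation, as vec_packsu does
 *   }
 */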

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    /* preload the first five rows (M2..P2); row P3 is loaded inside the loop */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        /* same 6-tap filter as the horizontal pass, applied down the columns */
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    /* first pass: horizontal 6-tap filter over 21 rows (16 output rows plus
     * the 5 extra rows the vertical filter needs), stored unscaled as 16-bit
     * intermediates in tmp */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* realign the unaligned source, exactly as in the horizontal lowpass */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        /* 20*(P0+P1) + (M2+P3) - 5*(M1+P2), no rounding or shift yet */
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* second pass setup: preload the first five rows of the 16-bit intermediate */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        /* vertical 6-tap filter over the 16-bit intermediates */
        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window down one row for the next iteration */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* widen the products to 32 bits as separate even/odd element streams */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* sum3 enters with weight 1: even elements via an arithmetic shift of
         * the reinterpreted 32-bit words, odd elements via a multiply by 1 */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        /* add the +512 rounding term */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        /* scale by >> 10 */
        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        /* narrow back, saturate to unsigned bytes and re-interleave the
         * even/odd streams into pixel order */
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
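
/*
 * For reference, a scalar sketch (editor's addition, hypothetical helper
 * names) of the two-pass filter above: the horizontal pass keeps full 16-bit
 * precision with no rounding, the vertical pass rounds with +512 and scales
 * by >> 10.  The sketch ignores the saturating 16-bit adds the vector code
 * uses on the intermediates.
 *
 *   static inline int16_t h264_hv_tmp(const uint8_t *p)      // first pass, per pixel
 *   {
 *       return (p[-2] + p[3]) - 5 * (p[-1] + p[2]) + 20 * (p[0] + p[1]);
 *   }
 *
 *   static inline uint8_t h264_hv_pixel(const int16_t *t, int stride)   // second pass
 *   {
 *       int v = (t[-2*stride] + t[3*stride])
 *             - 5 * (t[-stride] + t[2*stride])
 *             + 20 * (t[0] + t[stride]);
 *       v = (v + 512) >> 10;
 *       return v < 0 ? 0 : (v > 255 ? 255 : v);   // unsigned saturation, as in vec_packsu
 *   }
 */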