Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6147 serge 1
/*
2
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20
 
21
#include "libavutil/mips/generic_macros_msa.h"
22
#include "libavcodec/mips/hevcdsp_mips.h"
23
#include "libavcodec/mips/hevc_macros_msa.h"
24
 
25
/* Bi-prediction finishing step for two 8x16-bit vectors:
 * saturating-add the two prediction sources (vecN + inN), round-shift
 * right by rnd_val, then clip each lane to [0, 255].
 * in0/in1  : 16-bit samples of the first prediction
 * vec0/vec1: 16-bit samples of the second prediction
 * out0/out1: clipped results (still held in 16-bit lanes) */
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}
31
 
32
/* Four-vector variant of HEVC_BI_RND_CLIP2: add / round / clip four
 * pairs of prediction vectors in one invocation. */
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
                          vec0, vec1, vec2, vec3, rnd_val,         \
                          out0, out1, out2, out3)                  \
{                                                                  \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}
39
 
40
/* 4-pixel-wide bi-prediction "copy" (no interpolation) block.
 * The 8-bit source (src0_ptr) is widened to 16 bits and scaled by << 6,
 * added to the 16-bit intermediate of the other prediction (src1_ptr),
 * rounded by >> 7 and clipped to 8 bits into dst.
 * Handles height == 2, height == 4, and any multiple of 8; other
 * heights fall through with no output (callers presumably only pass
 * these sizes — NOTE(review): confirm against the dsp init tables). */
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 dst0, in0, in1;

        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);

        /* pack the two 4-byte rows / two 4-short rows into single vectors */
        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

        /* zero-extend bytes to halfwords, scale, add second prediction */
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        dst0 += in0;
        dst0 = __msa_srari_h(dst0, 7);
        dst0 = CLIP_SH_0_255(dst0);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST4x2_UB(dst0, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 dst0, dst1;
        v8i16 in0, in1, in2, in3;

        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        /* interleave four 4-wide rows into two full vectors */
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);

        dst0 <<= 6;
        dst1 <<= 6;
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 dst0, dst1, dst2, dst3;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        /* 8 rows per iteration */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src0_ptr, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src0_ptr += (8 * src_stride);

            LD_SH8(src1_ptr, src2_stride,
                   in0, in1, in2, in3, in4, in5, in6, in7);
            src1_ptr += (8 * src2_stride);

            /* pack two 4-short rows per vector */
            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

            /* pack two 4-byte rows per vector, then widen to halfwords */
            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST4x8_UB(dst0, dst1, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
119
 
120
/* 6-pixel-wide bi-prediction copy; processes 8 rows per iteration, so
 * height must be a multiple of 8 (NOTE(review): no remainder handling —
 * confirm callers only pass such heights).  Same add / >>7 / clip
 * pipeline as hevc_bi_copy_4w_msa; ST6x4_UB stores only 6 of the 8
 * computed bytes per row. */
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        /* widen the 8 byte-rows to halfword vectors */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   dst4, dst5, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        /* first 4 rows */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* last 4 rows */
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5);
        ST6x4_UB(dst4, dst5, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
162
 
163
/* 8-pixel-wide bi-prediction copy.  Special-cases heights 2, 4 and 6;
 * any multiple of 8 is handled by a loop doing two 4-row passes per
 * iteration.  Pipeline per row: widen bytes, << 6, saturating add of
 * the 16-bit second prediction, round >> 7, clip to [0,255], pack. */
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0, in1;
        v8i16 dst0, dst1;

        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);

        dst0 <<= 6;
        dst1 <<= 6;
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(dst0, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;
        v8i16 dst0, dst1, dst2, dst3;

        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;
        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        /* 4 rows, then the remaining 2 */
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST8x2_UB(dst2, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;
        v8i16 dst0, dst1, dst2, dst3;
        uint32_t loop_cnt;

        /* two identical 4-row passes per iteration (8 rows total) */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
            src0_ptr += (4 * src_stride);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST8x4_UB(dst0, dst1, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
            src0_ptr += (4 * src_stride);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST8x4_UB(dst0, dst1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
268
 
269
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
270
                                 int32_t src_stride,
271
                                 int16_t *src1_ptr,
272
                                 int32_t src2_stride,
273
                                 uint8_t *dst,
274
                                 int32_t dst_stride,
275
                                 int32_t height)
276
{
277
    uint32_t loop_cnt;
278
    v16i8 src0, src1, src2, src3;
279
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
280
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
281
    v16i8 zero = { 0 };
282
 
283
    for (loop_cnt = (16 >> 2); loop_cnt--;) {
284
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
285
        src0_ptr += (4 * src_stride);
286
 
287
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
288
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
289
        src1_ptr += (4 * src2_stride);
290
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
291
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
292
                   dst0, dst1, dst2, dst3);
293
 
294
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
295
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
296
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
297
        dst4 <<= 6;
298
        dst5 <<= 6;
299
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
300
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
301
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
302
 
303
        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
304
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
305
        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
306
        dst += (4 * dst_stride);
307
    }
308
}
309
 
310
/* Generic bi-prediction copy for widths that are multiples of 16.
 * Outer loop walks 16-column stripes; inner loop processes 4 rows of
 * one stripe.  Each 16-byte source row is split into right (cols 0-7)
 * and left (cols 8-15) halfword vectors, scaled by << 6, combined with
 * the matching halves of the 16-bit second prediction (in0..in3 =
 * cols 0-7, in4..in7 = cols 8-15), rounded and clipped.
 * height must be a multiple of 4. */
static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16i8 zero = { 0 };

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            v16i8 src0, src1, src2, src3;
            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
            v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
            v8i16 dst0_l, dst1_l, dst2_l, dst3_l;

            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
            src1_ptr_tmp += (4 * src2_stride);

            /* widen low and high byte halves of each row separately */
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0_r, dst1_r, dst2_r, dst3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0_l, dst1_l, dst2_l, dst3_l);

            SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
            SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
            /* rows 0-1: right halves pair with in0/in1, left with in4/in5 */
            HEVC_BI_RND_CLIP4(in0, in1, in4, in5,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* rows 2-3 */
            HEVC_BI_RND_CLIP4(in2, in3, in6, in7,
                              dst2_r, dst3_r, dst2_l, dst3_l, 7,
                              dst2_r, dst3_r, dst2_l, dst3_l);

            PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
            ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
        }

        /* advance to the next 16-column stripe */
        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
372
 
373
/* 16-wide bi-prediction copy: thin wrapper over the generic
 * multiple-of-16 implementation with width fixed at 16. */
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, height, 16);
}
384
 
385
/* 24-wide bi-prediction copy: 16 columns via the generic stripe routine
 * plus the remaining 8 columns via the 8-wide routine. */
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, height, 16);

    hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                        dst + 16, dst_stride, height);
}
399
 
400
/* 32-wide bi-prediction copy: generic stripe routine, width 32. */
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, height, 32);
}
411
 
412
/* 48-wide bi-prediction copy: generic stripe routine, width 48. */
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, height, 48);
}
423
 
424
/* 64-wide bi-prediction copy: generic stripe routine, width 64. */
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, height, 64);
}
435
 
436
/* 4-wide horizontal 8-tap bi-prediction filter.
 * Processes 8 rows per iteration, two source rows per shuffle (mask0's
 * upper half indexes the second operand, 16..20, so src pairs are
 * filtered together).  Sources are sign-flipped with XORI 128 so the
 * byte dot-products run on signed data; const_vec = 128 << 6 restores
 * the bias.  Filter taps come from the first 4 halfwords of `filter`.
 * height must be a multiple of 8. */
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    /* center the 8-tap window on the output position */
    src0_ptr -= 3;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* compensates the XORI 128 sign flip: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* pack two 4-short second-prediction rows per vector */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* each dstN accumulates the 8-tap result for one row pair */
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
508
 
509
/* 8-wide horizontal 8-tap bi-prediction filter; 4 rows per iteration,
 * one source row per shuffle (mask0 indexes within a single vector).
 * Same sign-flip / const_vec bias scheme as the 4-wide version.
 * height must be a multiple of 4. */
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    /* center the 8-tap window on the output position */
    src0_ptr -= 3;

    /* compensates the XORI 128 sign flip: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
576
 
577
/* 12-wide horizontal 8-tap bi-prediction filter: columns 0-7 via the
 * 8-wide routine, columns 8-11 via the 4-wide routine. */
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, filter, height);
    hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter, height);
}
591
 
592
/* 16-wide horizontal 8-tap bi-prediction filter; 2 rows per iteration.
 * Per row, two overlapping 16-byte loads (offsets 0 and 8) cover the
 * 16+7 input samples; src0/src2 yield columns 0-7, src1/src3 yield
 * columns 8-15.  height must be a multiple of 2. */
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    /* center the 8-tap window on the output position */
    src0_ptr -= 3;
    /* compensates the XORI 128 sign flip: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
662
 
663
/* 24-wide horizontal 8-tap bi-prediction filter; one row per iteration.
 * dst0 = cols 0-7 (within src0), dst1 = cols 8-15 (straddles src0/src1
 * via mask4..mask7, which index across both shuffle operands), dst2 =
 * cols 16-23 (within src1).  Stores 16 bytes plus an 8-byte tail. */
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    v16i8 src0, src1, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    /* center the 8-tap window on the output position */
    src0_ptr = src0_ptr - 3;
    /* compensates the XORI 128 sign flip: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* mask4..7 reach into the second shuffle operand (cols 8-15) */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);

        /* combine with the second prediction, round, clip */
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        ST_SB(tmp0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }
}
734
 
735
/* Horizontal 8-tap bi-prediction filter, 32 pixels wide.
 *
 * For each row: reads 39 bytes around the row from src0_ptr (pre-offset by
 * -3 for the left filter taps), applies the 8-tap filter via byte-pair dot
 * products (filt0..filt3), adds the 16-bit intermediates from src1_ptr,
 * rounds (>> 7), clips to [0, 255] and stores 32 output bytes to dst.
 *
 * src0_ptr    - unsigned 8-bit reference pixels
 * src_stride  - stride of src0_ptr in bytes
 * src1_ptr    - 16-bit intermediate (second prediction) samples
 * src2_stride - stride of src1_ptr in elements
 * dst         - output pixels, dst_stride bytes per row
 * filter      - 8 signed 8-tap coefficients (loaded as one v8i16)
 * height      - number of rows to process
 */
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    /* shuffle pattern selecting overlapping byte pairs for taps 0/1 */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    /* 128 << 6: offsets the -128 bias introduced by the XORI sign
     * conversion below (assumes filter taps sum to 64 — standard for the
     * HEVC luma filter; verify against caller-supplied coefficients). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* masks for taps 2..7 (mask1..3) and for the 16..31 column half that
     * straddles the src0/src1 vector boundary (mask4..7) */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        /* bytes 0..15 and 16..31; src2 re-loads 24..39 for columns 24..31 */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);    /* u8 -> biased s8 */

        /* columns 0..7 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        /* columns 8..15: taps cross into src1, hence mask4..mask7 */
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        /* columns 16..23 */
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        /* columns 24..31 (from the +24 reload) */
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        /* add second prediction, round >>7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
        dst += dst_stride;
    }
}
806
 
807
/* Horizontal 8-tap bi-prediction filter, 48 pixels wide.
 *
 * Processes each row in three 16-column stores. The filtered result is
 * summed with the 16-bit intermediates from src1_ptr, rounded (>> 7),
 * clipped to [0, 255] and written to dst.  Parameters are the same as
 * hevc_hz_bi_8t_32w_msa.
 */
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    /* shuffle pattern selecting overlapping byte pairs for taps 0/1 */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;    /* step back for the 3 left filter taps */

    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;     /* mask4..7: second half crossing vector boundary */
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        /* --- columns 0..15 --- */
        LD_SB2(src0_ptr, 16, src0, src1);
        XORI_B2_128_SB(src0, src1);
        LD_SH2(src1_ptr, 8, in0, in1);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_SB(tmp0, dst);

        /* --- columns 16..31 (src2/src3 cover bytes 32..39 / 40..47) --- */
        LD_SB2(src0_ptr + 32, 8, src2, src3);
        XORI_B2_128_SB(src2, src3);
        src0_ptr += src_stride;

        LD_SH2(src1_ptr + 16, 8, in2, in3);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);

        tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
        ST_SB(tmp1, dst + 16);

        /* --- columns 32..47 --- */
        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
        dst += dst_stride;
    }
}
907
 
908
/* Horizontal 8-tap bi-prediction filter, 64 pixels wide.
 *
 * Each row is handled as two 32-column passes (inner `cnt` loop) using the
 * same shuffle/dot-product scheme as hevc_hz_bi_8t_32w_msa: filter with
 * 8 taps, add the 16-bit intermediates from src1_ptr, round (>> 7), clip
 * to [0, 255], store to dst.
 */
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint8_t *src0_ptr_tmp;     /* per-row cursors for the 2x32 inner loop */
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    /* shuffle pattern selecting overlapping byte pairs for taps 0/1 */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 3;    /* step back for the 3 left filter taps */

    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;     /* mask4..7: half crossing the vector boundary */
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* two 32-pixel halves per row */
        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);   /* bytes 24..39 for cols 24..31 */
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);  /* u8 -> biased s8 */

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst0, dst0, dst0, dst0);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst1, dst1, dst1, dst1);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst2, dst2, dst2, dst2);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst3, dst3, dst3, dst3);

            /* add second prediction, round >>7, clip to 0..255 */
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
            ST_SB2(tmp0, tmp1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src1_ptr += src2_stride;
        src0_ptr += src_stride;
        dst += dst_stride;
    }
}
995
 
996
/* Vertical 8-tap bi-prediction filter, 4 pixels wide, 8 rows per iteration.
 *
 * Interleaves pairs of 4-pixel rows (ILVR_B*) and packs two row-pairs per
 * vector (ILVR_D*) so one dot-product chain filters two output rows at
 * once.  The filtered result is summed with the 16-bit intermediates from
 * src1_ptr, rounded (>> 7), clipped to [0, 255], and stored as 4-byte
 * columns.  height is processed in chunks of 8 rows (height assumed to be
 * a multiple of 8 — enforced by callers, verify).
 */
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);    /* 3 rows above for the top taps */

    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the 7 history rows and build the interleaved pairs */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);    /* u8 -> biased s8 */

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* pack two 4-sample intermediate rows per v8i16 */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        /* each DPADD chain produces two filtered output rows */
        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        /* add second prediction, round >>7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);

        /* rotate the 7-row history for the next iteration */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
1081
 
1082
/* Vertical 8-tap bi-prediction filter, 8 pixels wide, 4 rows per iteration.
 *
 * Keeps a sliding window of 7 interleaved row-pairs; each loop iteration
 * loads 4 new rows, filters 4 output rows with byte-pair dot products,
 * adds the 16-bit intermediates from src1_ptr, rounds (>> 7), clips to
 * [0, 255] and stores 8x4 bytes.  height is processed in chunks of 4 rows.
 */
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);    /* 3 rows above for the top taps */
    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: 7 history rows, sign-converted and interleaved in pairs */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        /* one 8-tap dot-product chain per output row */
        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        /* add second prediction, round >>7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);

        /* rotate the row history for the next 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}
1159
 
1160
/* Vertical 8-tap bi-prediction filter, 12 pixels wide, 4 rows per iteration.
 *
 * Columns 0..7 are filtered with right-half interleaves (ILVR_B*, one
 * v8i16 per row); columns 8..11 use left-half interleaves (ILVL_B*) with
 * two row-pairs packed per vector, mirroring the 4-wide scheme.  Results
 * are summed with the 16-bit intermediates from src1_ptr, rounded (>> 7),
 * clipped to [0, 255] and stored 12 bytes per row via ST12x4_UB.
 */
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);    /* 3 rows above for the top taps */
    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: 7 history rows, interleaved right (cols 0..7) and
     * left (cols 8..15, of which 8..11 are used) */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        /* pack two 4-sample intermediate rows (cols 8..11) per vector */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        /* columns 0..7: one filtered row per chain */
        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        /* columns 8..11: two filtered rows per chain */
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        /* add second prediction, round >>7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);


        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);

        /* rotate both row histories for the next 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
1265
 
1266
/* Vertical 8-tap bi-prediction filter for widths that are multiples of 16.
 *
 * Outer loop walks 16-column tiles across `width`; inner loop produces
 * 2 rows per iteration.  Each 16-wide tile is split into right (ILVR_B*)
 * and left (ILVL_B*) byte-interleaved halves, filtered with byte-pair dot
 * products, summed with the 16-bit intermediates from src1_ptr, rounded
 * (>> 7), clipped to [0, 255] and stored.  `width` must be a multiple of
 * 16; height is processed in chunks of 2 rows.
 */
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                           int32_t src_stride,
                                           int16_t *src1_ptr,
                                           int32_t src2_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter,
                                           int32_t height, int32_t width)
{
    uint8_t *src0_ptr_tmp;     /* per-tile cursors */
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);    /* 3 rows above for the top taps */
    /* offsets the -128 bias introduced by XORI sign conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        /* prologue per tile: 7 history rows, interleaved in pairs */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            XORI_B2_128_SB(src7, src8);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            /* right half (cols 0..7), rows 0 and 1 */
            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            /* left half (cols 8..15), rows 0 and 1 */
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            /* add second prediction, round >>7, clip to 0..255 */
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* rotate both row histories for the next 2 rows */
            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        /* advance to the next 16-column tile */
        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
1371
 
1372
/* Vertical 8-tap bi-prediction, 16 pixels wide: one 16-column tile. */
static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
}
1384
 
1385
/* Vertical 8-tap bi-prediction, 24 pixels wide: a 16-column tile for
 * columns 0..15 plus the 8-wide kernel for columns 16..23. */
static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, filter, height);
}
1399
 
1400
/* Vertical 8-tap bi-prediction, 32 pixels wide: two 16-column tiles. */
static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 32);
}
1412
 
1413
/* Vertical 8-tap bi-prediction, 48 pixels wide: three 16-column tiles. */
static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 48);
}
1425
 
1426
/* Vertical 8-tap bi-prediction, 64 pixels wide: four 16-column tiles. */
static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 64);
}
1438
 
1439
/* BI 8-tap horizontal+vertical (HV) filter for 4-pixel-wide blocks.
 *
 * src0_ptr/src_stride : 8-bit reference pixels; 8-tap horizontal filtering
 *                       is applied first, then 8-tap vertical filtering.
 * src1_ptr/src2_stride: 16-bit samples of the second prediction, added to
 *                       the filtered result before rounding.
 * dst/dst_stride      : 8-bit output block.
 * filter_x / filter_y : 8-tap horizontal / vertical coefficient arrays.
 * height              : number of rows; processed two rows per iteration.
 *
 * Two 4-wide rows share one vector: the upper half of mask0 indexes the
 * second shuffle operand (offsets 16..20), so each VSHF_B4_SB produces the
 * horizontal input for rows (n) and (n+3) at once.
 */
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1;                             /* second-prediction rows */
    v8i16 filt0, filt1, filt2, filt3;           /* horizontal taps */
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;   /* vertical taps, 32-bit lanes */
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;  /* paired filtered rows */
    v4i32 dst0_r, dst1_r, in0_r, in0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;   /* interleaved row pairs for */
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;   /* the vertical filter       */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    /* back up 3 rows and 3 columns so the 8-tap window is centered */
    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* sign-extend the 8-bit vertical taps to 16 bits (interleave with the
     * comparison mask, which is all-ones for negative bytes) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* accumulator bias: 128 << 6, the initial value of every horizontal MAC */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* vertical prologue: load and horizontally filter the first 7 rows */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    /* flip the sign bit so unsigned pixels can be used in signed MACs */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    /* interleave consecutive filtered rows for the vertical dot products */
    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        src1_ptr += (2 * src2_stride);

        /* pack the two 4-wide second-prediction rows into one vector */
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
        XORI_B2_128_SB(src7, src8);

        /* horizontal filter for rows 7 and 8 together */
        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
        /* vertical 8-tap filter for the two output rows */
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        /* drop the intermediate precision gained by the vertical MAC */
        dst0_r >>= 6;
        dst1_r >>= 6;
        /* add the second prediction, round-shift by 7, clip to [0, 255] */
        UNPCK_SH_SW(in0, in0_r, in0_l);
        dst0_r = __msa_adds_s_w(dst0_r, in0_r);
        dst1_r = __msa_adds_s_w(dst1_r, in0_l);
        SRARI_W2_SW(dst0_r, dst1_r, 7);
        dst0_r = CLIP_SW_0_255(dst0_r);
        dst1_r = CLIP_SW_0_255(dst1_r);

        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
        ST4x2_UB(dst0_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* slide the vertical filter window down by two rows */
        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
    }
}
1557
 
1558
/* Generic BI 8-tap HV filter kernel for widths that are multiples of 8.
 *
 * The block is processed in vertical stripes of 8 columns (outer loop over
 * width >> 3); each stripe is processed two rows per inner-loop iteration.
 * Per stripe: 7 rows are horizontally filtered as a prologue, then for each
 * new pair of rows the 8-tap vertical filter is applied, the 16-bit second
 * prediction from src1_ptr is added, and the result is rounded (>> 7 with
 * rounding) and clipped to [0, 255].
 *
 * Used directly by the 8/16/24/32/48/64-wide wrappers below.
 */
static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height, int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1;                             /* second-prediction rows */
    v4i32 in0_r, in0_l, in1_r, in1_l;
    v8i16 filt0, filt1, filt2, filt3;           /* horizontal taps */
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;   /* vertical taps, 32-bit lanes */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;   /* interleaved row pairs,   */
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;   /* right/left halves, for   */
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;   /* the vertical dot product */
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;

    /* back up 3 rows and 3 columns so the 8-tap window is centered */
    src0_ptr -= ((3 * src_stride) + 3);
    /* accumulator bias: 128 << 6, the initial value of every horizontal MAC */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* sign-extend the 8-bit vertical taps to 16 bits */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* process the block in 8-column-wide vertical stripes */
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* vertical prologue: load and horizontally filter the first 7 rows */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        /* flip the sign bit so unsigned pixels can be used in signed MACs */
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        dst3 = const_vec;
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        /* rows 4, 5 and 6 */
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        dst5 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        dst6 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        /* interleave consecutive filtered rows (right and left halves) */
        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            /* row 7 */
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src0_ptr_tmp += 2 * src_stride;

            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            src1_ptr_tmp += (2 * src2_stride);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            /* vertical 8-tap filter for the first of the two output rows */
            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);

            /* vertical 8-tap filter for the second output row */
            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;

            /* add second prediction, round-shift by 7, clip to [0, 255] */
            UNPCK_SH_SW(in0, in0_r, in0_l);
            UNPCK_SH_SW(in1, in1_r, in1_l);
            in0_r = __msa_adds_s_w(in0_r, dst0_r);
            in0_l = __msa_adds_s_w(in0_l, dst0_l);
            in1_r = __msa_adds_s_w(in1_r, dst1_r);
            in1_l = __msa_adds_s_w(in1_l, dst1_l);
            SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7);
            in0_r = CLIP_SW_0_255(in0_r);
            in0_l = CLIP_SW_0_255(in0_l);
            in1_r = CLIP_SW_0_255(in1_r);
            in1_l = CLIP_SW_0_255(in1_l);

            HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r);
            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* slide the vertical filter window down by two rows */
            dst10_r = dst32_r;
            dst32_r = dst54_r;
            dst54_r = dst76_r;
            dst10_l = dst32_l;
            dst32_l = dst54_l;
            dst54_l = dst76_l;
            dst21_r = dst43_r;
            dst43_r = dst65_r;
            dst65_r = dst87_r;
            dst21_l = dst43_l;
            dst43_l = dst65_l;
            dst65_l = dst87_l;
            dst6 = dst8;
        }

        /* advance all three planes to the next 8-column stripe */
        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
1734
 
1735
/* 8-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 8. */
static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
}
1749
 
1750
/* 12-pixel-wide case of the BI HV 8-tap filter, split as 8 + 4 columns:
 * the left 8 columns go through the 8-column-multiple kernel, the
 * remaining 4 columns (all pointers offset by 8) through the 4-wide one. */
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);

    hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter_x, filter_y, height);
}
1767
 
1768
/* 16-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 16. */
static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}
1782
 
1783
/* 24-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 24. */
static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}
1797
 
1798
/* 32-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 32. */
static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}
1812
 
1813
/* 48-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 48. */
static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 48);
}
1827
 
1828
/* 64-pixel-wide case of the BI HV 8-tap filter: thin wrapper that forwards
 * to the shared 8-column-multiple kernel with width = 64. */
static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 64);
}
1842
 
1843
/* BI 4-tap horizontal filter for one 4x2 block.
 *
 * src0_ptr rows are horizontally filtered with the 4-tap `filter`, the
 * 16-bit second prediction from src1_ptr is added, and the result is
 * rounded (srari by 7) and clipped to [0, 255].  The height parameter is
 * not used here: this helper always processes exactly two rows; the
 * 4w dispatcher only calls it for height == 2.
 */
static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 in0, in1;                    /* second-prediction rows */
    /* upper half indexes the second shuffle operand (offsets 16..20), so
     * both 4-wide rows are filtered in a single vector */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v16i8 mask1;
    v8i16 tmp0;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* pack both 4-wide second-prediction rows into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    /* flip the sign bit so unsigned pixels can be used in signed MACs */
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    tmp0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);

    /* add second prediction, round-shift by 7, clip and pack to bytes */
    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);
    tmp0 = CLIP_SH_0_255(tmp0);
    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);

    ST4x2_UB(dst0, dst, dst_stride);
}
1885
 
1886
/* BI 4-tap horizontal filter for one 4x4 block.
 *
 * Same scheme as the 4x2 helper but for four rows: two source rows share
 * one vector (mask0's upper half indexes the second shuffle operand), so
 * two DPADD accumulations cover all four rows.  The height parameter is
 * unused; the 4w dispatcher only calls this for height == 4.
 */
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
    v8i16 in0, in1, in2, in3;          /* second-prediction rows */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v16i8 mask1;
    v8i16 tmp0, tmp1;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

    /* pack the four 4-wide second-prediction rows into two vectors */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    /* flip the sign bit so unsigned pixels can be used in signed MACs */
    XORI_B4_128_SB(src0, src1, src2, src3);

    /* rows 0+1 in tmp0, rows 2+3 in tmp1 */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    tmp0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    tmp1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
    /* add second prediction, round-shift by 7 and clip to [0, 255] */
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
}
1930
 
1931
/* BI 4-tap horizontal filter for 4-wide blocks whose height is a multiple
 * of 8 (the 4w dispatcher calls it for heights 8 and 16).
 *
 * Eight rows are consumed per loop iteration; as in the other 4-wide
 * helpers two source rows share one vector via mask0, so four DPADD
 * accumulations cover the eight rows.  Second prediction is added, then
 * the result is round-shifted by 7 and clipped to [0, 255].
 */
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 dst0, dst1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;  /* second prediction */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v16i8 mask1, vec0, vec1;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pack the eight 4-wide second-prediction rows into four vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        /* flip the sign bit so unsigned pixels can be used in signed MACs */
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* two source rows per accumulator: 0+1, 2+3, 4+5, 6+7 */
        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        tmp0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        tmp1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        tmp2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        tmp3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);

        /* add second prediction, round-shift by 7 and clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
1993
 
1994
static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
1995
                                 int32_t src_stride,
1996
                                 int16_t *src1_ptr,
1997
                                 int32_t src2_stride,
1998
                                 uint8_t *dst,
1999
                                 int32_t dst_stride,
2000
                                 const int8_t *filter,
2001
                                 int32_t height)
2002
{
2003
    if (2 == height) {
2004
        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2005
                              dst, dst_stride, filter, height);
2006
    } else if (4 == height) {
2007
        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2008
                              dst, dst_stride, filter, height);
2009
    } else if (8 == height || 16 == height) {
2010
        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2011
                                      src1_ptr, src2_stride,
2012
                                      dst, dst_stride, filter, height);
2013
    }
2014
}
2015
 
2016
/* BI 4-tap horizontal filter for 6-pixel-wide blocks.
 *
 * Four rows per loop iteration; each row is filtered in its own 8-wide
 * accumulator (only 6 output pixels per row are stored via ST6x4_UB).
 * Second prediction is added, then the result is round-shifted by 7 and
 * clipped to [0, 255].
 */
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;          /* second-prediction rows */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        /* flip the sign bit so unsigned pixels can be used in signed MACs */
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* one accumulator per row */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* add second prediction, round-shift by 7 and clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2073
 
2074
/* BI 4-tap horizontal filter for one 8x2 block.
 *
 * Each row is filtered in its own 8-wide accumulator; second prediction is
 * added, then the result is round-shifted by 7 and clipped to [0, 255].
 * The height parameter is unused: exactly two rows are processed.
 */
static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1;
    v8i16 in0, in1;                    /* second-prediction rows */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* flip the sign bit so unsigned pixels can be used in signed MACs */
    XORI_B2_128_SB(src0, src1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
    /* add second prediction, round-shift by 7 and clip to [0, 255] */
    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST8x2_UB(dst0, dst, dst_stride);
}
2116
 
2117
/* BI 4-tap horizontal filter for one 8x6 block.
 *
 * All six rows are loaded and filtered up front, one 8-wide accumulator
 * per row; second prediction is added, then results are round-shifted by 7
 * and clipped to [0, 255].  Stored as 4 rows + 2 rows.  The height
 * parameter is unused: exactly six rows are processed.
 */
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;                /* the two 4-tap coefficient pairs */
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;  /* second-prediction rows */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap window */
    src0_ptr -= 1;

    /* accumulator bias: 128 << 6 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    /* flip the sign bit so unsigned pixels can be used in signed MACs */
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);

    /* one accumulator per row */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    /* add second prediction, round-shift by 7 and clip to [0, 255] */
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2, dst, dst_stride);
}
2180
 
2181
/* Horizontal 4-tap bi-prediction filter, 8 columns wide, height a multiple
 * of 4.  For each row the uint8 reference (src0_ptr) is filtered
 * horizontally, the int16 prediction (src1_ptr) is added, and the result is
 * rounded (>> 7 with rounding) and clipped to 0..255 before being stored
 * as bytes in dst (see HEVC_BI_RND_CLIP4). */
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    /* shuffle mask pairing adjacent bytes for the 2-way dot products */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;  /* back up one pel to center the 4-tap filter */

    /* 128 << 6 = 8192: compensates for the signed-range shift (xori 128)
     * applied to the source bytes below, times the filter gain of 64 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* two tap-pair coeffs */

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* uint8 -> signed range */

        /* per row: shuffle tap pairs, start from the bias, accumulate the
         * two signed-byte dot products */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* add the int16 prediction, round by 7 and clip to 8 bits */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);  /* 16 -> 8 bit */
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2238
 
2239
static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2240
                                 int32_t src_stride,
2241
                                 int16_t *src1_ptr,
2242
                                 int32_t src2_stride,
2243
                                 uint8_t *dst,
2244
                                 int32_t dst_stride,
2245
                                 const int8_t *filter,
2246
                                 int32_t height)
2247
{
2248
    if (2 == height) {
2249
        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2250
                              dst, dst_stride, filter, height);
2251
    } else if (6 == height) {
2252
        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2253
                              dst, dst_stride, filter, height);
2254
    } else if (0 == (height % 4)) {
2255
        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2256
                                      src1_ptr, src2_stride,
2257
                                      dst, dst_stride, filter, height);
2258
    }
2259
}
2260
 
2261
/* Horizontal 4-tap bi-prediction filter, 12 columns wide, 4 rows per loop.
 * The first 8 columns use the in-register masks mask0/mask1; the last 4
 * columns of two consecutive rows are handled together through mask2/mask3,
 * which pull bytes from a pair of source registers (indices >= 16 select
 * the second operand of vshf.b). */
static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    /* cross-register mask: low half from first operand (cols 8..12),
     * high half from second operand (24.. maps to its cols 8..12) */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;  /* center the 4-tap filter */

    /* bias 128 << 6 compensating the xori-128 signed conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);  /* cols 8..11 */
        src1_ptr += (4 * src2_stride);

        /* pack the 4-wide prediction halves of row pairs into in4/in5 */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* columns 0..7 of each of the four rows */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        /* columns 8..11 of two rows at a time via the cross-register masks */
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);  /* 8+4 pel stores */
        dst += (4 * dst_stride);
    }
}
2333
 
2334
/* Horizontal 4-tap bi-prediction filter, 16 columns wide, 4 rows per loop.
 * Each row is processed as two 8-column halves: even-numbered src/in
 * registers hold the left half, odd-numbered ones the right half. */
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;  /* center the 4-tap filter */

    /* 128 << 6 bias compensating the signed-range conversion below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* left halves in src0/2/4/6, right halves (offset +8) in src1/3/5/7 */
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* one 8-wide filtered half per dstN register */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        /* pack the two halves of each row into one 16-byte store */
        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2408
 
2409
/* Horizontal 4-tap bi-prediction filter, 24 columns wide, 4 rows per loop.
 * Columns 0..15 are produced in the first half of the loop body and stored
 * through dst; columns 16..23 reuse the already-loaded/filtered right-hand
 * source registers (src1/3/5/7) and are stored through dst_tmp. */
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;  /* center the 4-tap filter */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    /* mask2/mask3 straddle the boundary between two source registers */
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    dst_tmp = dst + 16;            /* output cursor for columns 16..23 */
    src1_ptr_tmp = src1_ptr + 16;  /* prediction cursor for columns 16..23 */

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* columns 0..7 from srcN alone, columns 8..15 from the srcN/srcN+1
         * pair via mask2/mask3, for each of the four rows */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);  /* cols 0..15 */
        dst += (4 * dst_stride);

        /* columns 16..23: prediction reloaded, pels reused from src1/3/5/7 */
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);  /* cols 16..23 */
        dst_tmp += (4 * dst_stride);
    }
}
2513
 
2514
/* Horizontal 4-tap bi-prediction filter, 32 columns wide, 2 rows per loop
 * (the loop body is the same single-row sequence repeated twice).
 * Columns 0..15 come from src0 and the src0/src1 pair; columns 16..23 from
 * src1; columns 24..31 from the extra unaligned load src2 (offset +24),
 * which supplies the taps that would otherwise cross the src1 boundary. */
static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;  /* center the 4-tap filter */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    /* cross-register masks for the columns spanning src0/src1 */
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* ---- first row ---- */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);  /* tail pels for columns 24..31 */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);  /* 32 prediction samples */
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);  /* two 16-byte stores = 32 pels */
        dst += dst_stride;

        /* ---- second row (identical sequence) ---- */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;
    }
}
2601
 
2602
/* Vertical 4-tap bi-prediction filter, 4 columns x 2 rows (height is
 * fixed; the parameter is kept for the common signature).  Two output
 * rows are packed into one vector: srcAB_r registers interleave rows A/B
 * byte-wise, and ilvr_d stacks two such pairs so a single pair of
 * dot-product ops filters both rows at once. */
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;  /* start one row above for the 4-tap window */

    /* 128 << 6 bias compensating the xori-128 signed conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    /* interleave row pairs, then stack them into one register */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);  /* both rows' pred */
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    /* filter, add prediction, round by 7, clip to 0..255 */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    dst10 = CLIP_SH_0_255(dst10);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST4x2_UB(dst10, dst, dst_stride);
}
2649
 
2650
/* Vertical 4-tap bi-prediction filter, 4 columns x 4 rows.  Uses the same
 * two-rows-per-register stacking as the 4x2 variant, producing four output
 * rows from two dot-product accumulations (dst10 and dst32). */
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;  /* one row above for the 4-tap window */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);  /* pack pred rows pairwise */
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);

    /* rows 0/1 in dst10, rows 2/3 in dst32 */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    /* add prediction, round by 7, clip to 0..255 */
    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
}
2698
 
2699
/* Vertical 4-tap bi-prediction filter, 4 columns, height a multiple of 8.
 * Eight rows are produced per loop iteration using the stacked
 * two-rows-per-register scheme; src2110 carries the sliding history of
 * the last two interleaved rows across iterations. */
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;  /* one row above for the 4-tap window */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: first three rows seed the sliding window */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {  /* 8 rows per iteration */
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        /* pack the eight 4-wide prediction rows into four registers */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);

        dst10 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        /* last two rows of this block; src2/src2110 also become the
         * history for the next iteration */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
2771
 
2772
static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
2773
                                 int32_t src_stride,
2774
                                 int16_t *src1_ptr,
2775
                                 int32_t src2_stride,
2776
                                 uint8_t *dst,
2777
                                 int32_t dst_stride,
2778
                                 const int8_t *filter,
2779
                                 int32_t height)
2780
{
2781
    if (2 == height) {
2782
        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2783
                              dst, dst_stride, filter, height);
2784
    } else if (4 == height) {
2785
        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2786
                              dst, dst_stride, filter, height);
2787
    } else {
2788
        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2789
                                      src1_ptr, src2_stride,
2790
                                      dst, dst_stride, filter, height);
2791
    }
2792
}
2793
 
2794
/* Vertical 4-tap bi-prediction filter, 6 columns wide, 4 rows per loop.
 * Rows are filtered two at a time; the interleaved-row registers
 * src10_r/src21_r (and src2) are recycled mid-loop to serve as the
 * sliding history for the next pair of rows and the next iteration. */
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;  /* one row above for the 4-tap window */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: seed the sliding window with the first three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        /* rows 0 and 1 of this block */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* next two source rows; note src2 is reloaded here so the window
         * is already primed for the following iteration */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        /* rows 2 and 3 of this block */
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        /* add prediction, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* 6-pel stores */
        dst += (4 * dst_stride);
    }
}
2856
 
2857
/* Vertical 4-tap bi-prediction filter, 8 columns x 2 rows (height is
 * fixed; the parameter is kept for the common signature). */
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;  /* one row above for the 4-tap window */

    /* 128 << 6 bias compensating the signed-range conversion */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);  /* row pairs */

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    /* one dot-product accumulation per output row */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    /* add prediction, round by 7, clip to 0..255, pack to bytes */
    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);

    ST8x2_UB(dst0_r, dst, dst_stride);
}
2900
 
2901
/* Vertical 4-tap bi-prediction, 8x6 block: six 8-pixel output rows in one
 * unrolled pass (no loop).  Same add/round/clip bi-prediction combine as the
 * 8x2 variant.
 *
 * height is unused; caller dispatches this only for height 6.
 */
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* six more rows plus the six second-prediction rows */
    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* one dot-product pair per output row, rows 0..5 */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    dst4_r = const_vec;
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    dst5_r = const_vec;
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    /* combine with second prediction, round, clip */
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);

    /* pack 16-bit results to bytes and store 4 + 2 rows */
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2_r, dst, dst_stride);
}
2961
 
2962
/* Vertical 4-tap bi-prediction, width 8, height a multiple of 4: processes
 * four output rows per loop iteration, carrying the interleaved row pairs
 * across iterations so each source row is loaded once.
 */
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime the pipeline with three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* src2 is reloaded here so the carried state (src10_r/src21_r/src2)
           is ready for the next iteration */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        /* combine with second prediction, round by 7, clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3023
 
3024
/* Vertical 4-tap bi-prediction, width 8: dispatch on block height to the
 * specialized 8x2 / 8x6 kernels, or the generic multiple-of-4 kernel for
 * every other height.
 */
static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    switch (height) {
    case 2:
        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
        break;
    case 6:
        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
        break;
    default:
        /* remaining heights are multiples of 4 */
        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
        break;
    }
}
3045
 
3046
/* Vertical 4-tap bi-prediction, width 12: the left 8 columns are filtered
 * with full-width interleaves (ILVR), the rightmost 4 columns with the
 * high-half interleaves (ILVL) packed two rows per vector (src2110 etc.).
 * Four output rows per loop iteration.
 */
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= (1 * src_stride);

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* fold the two 4-wide right-column row pairs into one vector */
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* in4..in7 carry the 4-wide right columns of the second prediction */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        /* right 4 columns, two rows at once */
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);

        /* src2 reloaded so carried state is valid for the next iteration */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
        /* combine with second prediction, round by 7, clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3125
 
3126
/* Vertical 4-tap bi-prediction, width 16: low 8 columns via ILVR, high 8
 * columns via ILVL.  Each loop iteration produces four output rows in two
 * row-pair passes, storing after each pair.
 */
static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first row pair */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* in2/in3 are the high 8 columns of the second prediction */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* combine with second prediction, round by 7, clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* second row pair; src2 reloaded for the next iteration's state */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
3210
 
3211
/* Vertical 4-tap bi-prediction, width 24: handled as a 16-wide strip
 * (ILVR + ILVL halves) plus an 8-wide strip at column offset 16 (ILVR only,
 * src6..src11 registers).  Four output rows per loop iteration, processed
 * and stored as two row pairs.
 */
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        /* second prediction: columns 0-7 (in0/1), 8-15 (in2/3), 16-23 (in4/5) */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        /* 16width: combine, round by 7, clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* 16width: second row pair; src2/src8 reloaded as next-iter state */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
3334
 
3335
/* Vertical 4-tap bi-prediction, width 32: two independent 16-wide strips
 * (the second at column offset 16, stored through dst_tmp).  Two output
 * rows per loop iteration; row-pair interleaves are rotated forward at the
 * end of each strip so rows are loaded once.
 */
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        /* second prediction for all 32 columns, 8 at a time */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* 16width: combine, round by 7, clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        /* rotate carried state of the left strip */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* next 16width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
        /* next 16width */
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
                          dst2_r, dst3_r, dst2_l, dst3_l);

        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* rotate carried state of the right strip */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
3444
 
3445
/* 2-D (horizontal then vertical) 4-tap bi-prediction, 4x2 block: the
 * horizontal pass runs per source row via shuffles + dot products, the
 * vertical pass combines the intermediate rows at 32-bit precision, then
 * the second prediction is added, rounded (>> 7) and clipped to [0, 255].
 *
 * filter_x / filter_y  - horizontal / vertical 4-tap coefficients
 * height               - unused here; caller dispatches this only for height 2
 */
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v8i16 in0, in1;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    /* mask0/mask1 select horizontally adjacent byte pairs for the h-filter */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    /* back up one row and one column: 2-D 4-tap needs leading context in
       both directions */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 32 bits (clti_s_b gives the sign mask) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    /* horizontal pass for the three context rows */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* both 4-wide second-prediction rows packed into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src3, src4);
    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;
    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;
    /* pack both rows, add second prediction, round, clip */
    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);

    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
    ST4x2_UB(dst0_r, dst, dst_stride);
}
3524
 
3525
/* 2-D (horizontal then vertical) 4-tap bi-prediction, 4x4 block: fully
 * unrolled variant of the 4x2 kernel producing four output rows.
 *
 * height is unused; caller dispatches this only for height 4.
 */
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v8i16 in0, in1, in2, in3;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    /* mask0/mask1 select horizontally adjacent byte pairs for the h-filter */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 dst0_r, dst1_r;
    v4i32 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    /* one row and one column of leading context for the 2-D 4-tap filter */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 32 bits */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias (128 << 6) undoing the XORI sign-flip at 6-bit filter precision */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    /* horizontal pass for the three context rows */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    /* pack the four 4-wide second-prediction rows into two vectors */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    XORI_B4_128_SB(src3, src4, src5, src6);
    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    tmp0 >>= 6;
    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    tmp1 >>= 6;
    /* row 5 (dst10_r/dst21_r reused as scratch from here on) */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    dst10_r = __msa_ilvr_h(dst5, dst4);
    tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    tmp2 >>= 6;
    /* row 6 */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    dst21_r = __msa_ilvr_h(dst2, dst5);
    tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    tmp3 >>= 6;
    /* pack rows, add second prediction, round by 7, clip to [0, 255] */
    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);

    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    /* NOTE(review): dst is dead after this point; this increment is a
       leftover from the looped variants and has no effect */
    dst += (4 * dst_stride);
}
3618
 
3619
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3620
                                          int32_t src_stride,
3621
                                          int16_t *src1_ptr,
3622
                                          int32_t src2_stride,
3623
                                          uint8_t *dst,
3624
                                          int32_t dst_stride,
3625
                                          const int8_t *filter_x,
3626
                                          const int8_t *filter_y,
3627
                                          int32_t height)
3628
{
3629
    uint32_t loop_cnt;
3630
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3631
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3632
    v8i16 filt0, filt1;
3633
    v4i32 filt_h0, filt_h1;
3634
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3635
    v16i8 mask1;
3636
    v8i16 filter_vec, const_vec;
3637
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3638
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3639
    v8i16 tmp0, tmp1, tmp2, tmp3;
3640
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3641
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3642
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3643
 
3644
    src0_ptr -= (src_stride + 1);
3645
 
3646
    filter_vec = LD_SH(filter_x);
3647
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3648
 
3649
    filter_vec = LD_SH(filter_y);
3650
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3651
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3652
 
3653
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3654
 
3655
    mask1 = mask0 + 2;
3656
 
3657
    const_vec = __msa_ldi_h(128);
3658
    const_vec <<= 6;
3659
 
3660
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3661
    src0_ptr += (3 * src_stride);
3662
    XORI_B3_128_SB(src0, src1, src2);
3663
 
3664
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3665
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3666
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3667
    dst0 = const_vec;
3668
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3669
    dst1 = const_vec;
3670
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3671
    dst2 = const_vec;
3672
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3673
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3674
 
3675
    for (loop_cnt = height >> 3; loop_cnt--;) {
3676
        LD_SB8(src0_ptr, src_stride,
3677
               src3, src4, src5, src6, src7, src8, src9, src10);
3678
        src0_ptr += (8 * src_stride);
3679
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3680
        src1_ptr += (8 * src2_stride);
3681
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3682
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3683
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3684
        /* row 3 */
3685
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3686
        dst3 = const_vec;
3687
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3688
        dst32_r = __msa_ilvr_h(dst3, dst2);
3689
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3690
        dst0_r >>= 6;
3691
        /* row 4 */
3692
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3693
        dst4 = const_vec;
3694
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3695
        dst43_r = __msa_ilvr_h(dst4, dst3);
3696
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3697
        dst1_r >>= 6;
3698
        /* row 5 */
3699
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3700
        dst5 = const_vec;
3701
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3702
        dst54_r = __msa_ilvr_h(dst5, dst4);
3703
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3704
        dst2_r >>= 6;
3705
        /* row 6 */
3706
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3707
        dst6 = const_vec;
3708
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3709
        dst65_r = __msa_ilvr_h(dst6, dst5);
3710
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3711
        dst3_r >>= 6;
3712
        /* row 7 */
3713
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3714
        dst7 = const_vec;
3715
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3716
        dst76_r = __msa_ilvr_h(dst7, dst6);
3717
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3718
        dst4_r >>= 6;
3719
        /* row 8 */
3720
        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3721
        dst8 = const_vec;
3722
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3723
        dst87_r = __msa_ilvr_h(dst8, dst7);
3724
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3725
        dst5_r >>= 6;
3726
        /* row 9 */
3727
        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3728
        dst9 = const_vec;
3729
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
3730
        dst10_r = __msa_ilvr_h(dst9, dst8);
3731
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3732
        dst6_r >>= 6;
3733
        /* row 10 */
3734
        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3735
        dst2 = const_vec;
3736
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3737
        dst21_r = __msa_ilvr_h(dst2, dst9);
3738
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3739
        dst7_r >>= 6;
3740
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3741
                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
3742
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3743
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
3744
 
3745
        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3746
        ST4x8_UB(tmp0, tmp1, dst, dst_stride);
3747
        dst += (8 * dst_stride);
3748
    }
3749
}
/* HEVC bi-pred HV 4-tap filter, width 4: dispatch on height to the
 * specialized 4x2 / 4x4 / 4xN (N a multiple of 8) kernels.  Other heights
 * fall through without writing any output. */
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else if (4 == height) {
        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else if (0 == (height % 8)) {
        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height);
    }
}
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
3776
                                 int32_t src_stride,
3777
                                 int16_t *src1_ptr,
3778
                                 int32_t src2_stride,
3779
                                 uint8_t *dst,
3780
                                 int32_t dst_stride,
3781
                                 const int8_t *filter_x,
3782
                                 const int8_t *filter_y,
3783
                                 int32_t height)
3784
{
3785
    uint32_t loop_cnt;
3786
    v16i8 src0, src1, src2, src3, src4, src5, src6;
3787
    v8i16 in0, in1, in2, in3;
3788
    v8i16 filt0, filt1;
3789
    v4i32 filt_h0, filt_h1;
3790
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3791
    v16i8 mask1;
3792
    v8i16 filter_vec, const_vec;
3793
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3794
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3795
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3796
    v8i16 tmp0, tmp1, tmp2, tmp3;
3797
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3798
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3799
 
3800
    src0_ptr -= (src_stride + 1);
3801
 
3802
    filter_vec = LD_SH(filter_x);
3803
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3804
 
3805
    filter_vec = LD_SH(filter_y);
3806
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3807
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3808
 
3809
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3810
 
3811
    mask1 = mask0 + 2;
3812
 
3813
    const_vec = __msa_ldi_h(128);
3814
    const_vec <<= 6;
3815
 
3816
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3817
    src0_ptr += (3 * src_stride);
3818
    XORI_B3_128_SB(src0, src1, src2);
3819
 
3820
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3821
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3822
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3823
    dst0 = const_vec;
3824
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3825
    dst1 = const_vec;
3826
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3827
    dst2 = const_vec;
3828
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3829
 
3830
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3831
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3832
 
3833
    for (loop_cnt = height >> 2; loop_cnt--;) {
3834
        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3835
        src0_ptr += (4 * src_stride);
3836
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3837
        src1_ptr += (4 * src2_stride);
3838
        XORI_B4_128_SB(src3, src4, src5, src6);
3839
 
3840
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3841
        dst3 = const_vec;
3842
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3843
 
3844
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3845
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3846
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3847
        dst0_r >>= 6;
3848
        dst0_l >>= 6;
3849
 
3850
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3851
        dst4 = const_vec;
3852
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3853
 
3854
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3855
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3856
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3857
        dst1_r >>= 6;
3858
        dst1_l >>= 6;
3859
 
3860
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3861
        dst5 = const_vec;
3862
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3863
 
3864
        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3865
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3866
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3867
        dst2_r >>= 6;
3868
        dst2_l >>= 6;
3869
 
3870
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3871
        dst2 = const_vec;
3872
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3873
 
3874
        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3875
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3876
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3877
        dst3_r >>= 6;
3878
        dst3_l >>= 6;
3879
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3880
                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
3881
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3882
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
3883
 
3884
        PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3885
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3886
        dst += (4 * dst_stride);
3887
    }
3888
}
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
3891
                                  int32_t src_stride,
3892
                                  int16_t *src1_ptr,
3893
                                  int32_t src2_stride,
3894
                                  uint8_t *dst,
3895
                                  int32_t dst_stride,
3896
                                  const int8_t *filter_x,
3897
                                  const int8_t *filter_y,
3898
                                  int32_t height)
3899
{
3900
    v16i8 src0, src1, src2, src3, src4;
3901
    v8i16 filt0, filt1;
3902
    v4i32 filt_h0, filt_h1;
3903
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3904
    v16i8 mask1;
3905
    v8i16 filter_vec, const_vec;
3906
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3907
    v8i16 dst0, dst1, dst2, dst3, dst4;
3908
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3909
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3910
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3911
    v8i16 tmp0, tmp1;
3912
    v8i16 in0, in1;
3913
 
3914
    src0_ptr -= (src_stride + 1);
3915
 
3916
    filter_vec = LD_SH(filter_x);
3917
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3918
 
3919
    filter_vec = LD_SH(filter_y);
3920
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3921
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3922
 
3923
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3924
 
3925
    mask1 = mask0 + 2;
3926
 
3927
    const_vec = __msa_ldi_h(128);
3928
    const_vec <<= 6;
3929
 
3930
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3931
    src0_ptr += (3 * src_stride);
3932
    XORI_B3_128_SB(src0, src1, src2);
3933
 
3934
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3935
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3936
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3937
    dst0 = const_vec;
3938
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3939
    dst1 = const_vec;
3940
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3941
    dst2 = const_vec;
3942
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3943
 
3944
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3945
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3946
 
3947
    LD_SB2(src0_ptr, src_stride, src3, src4);
3948
    LD_SH2(src1_ptr, src2_stride, in0, in1);
3949
    XORI_B2_128_SB(src3, src4);
3950
 
3951
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3952
    dst3 = const_vec;
3953
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3954
 
3955
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3956
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3957
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3958
    dst0_r >>= 6;
3959
    dst0_l >>= 6;
3960
 
3961
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3962
    dst4 = const_vec;
3963
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3964
 
3965
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3966
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3967
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3968
    dst1_r >>= 6;
3969
    dst1_l >>= 6;
3970
 
3971
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
3972
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
3973
 
3974
    dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3975
    ST8x2_UB(dst0_r, dst, dst_stride);
3976
}
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
3979
                                  int32_t src_stride,
3980
                                  int16_t *src1_ptr,
3981
                                  int32_t src2_stride,
3982
                                  uint8_t *dst,
3983
                                  int32_t dst_stride,
3984
                                  const int8_t *filter_x,
3985
                                  const int8_t *filter_y,
3986
                                  int32_t height)
3987
{
3988
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3989
    v8i16 in0, in1, in2, in3, in4, in5;
3990
    v8i16 filt0, filt1;
3991
    v4i32 filt_h0, filt_h1;
3992
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3993
    v16i8 mask1;
3994
    v8i16 filter_vec, const_vec;
3995
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3996
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3997
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3998
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3999
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4000
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4001
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4002
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4003
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4004
 
4005
    src0_ptr -= (src_stride + 1);
4006
 
4007
    filter_vec = LD_SH(filter_x);
4008
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4009
 
4010
    filter_vec = LD_SH(filter_y);
4011
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4012
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4013
 
4014
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4015
 
4016
    mask1 = mask0 + 2;
4017
 
4018
    const_vec = __msa_ldi_h(128);
4019
    const_vec <<= 6;
4020
 
4021
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4022
    src0_ptr += (3 * src_stride);
4023
    XORI_B3_128_SB(src0, src1, src2);
4024
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4025
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4026
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4027
    dst0 = const_vec;
4028
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4029
    dst1 = const_vec;
4030
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4031
    dst2 = const_vec;
4032
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4033
 
4034
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4035
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4036
 
4037
    LD_SB2(src0_ptr, src_stride, src3, src4);
4038
    src0_ptr += (2 * src_stride);
4039
    XORI_B2_128_SB(src3, src4);
4040
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4041
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4042
    dst3 = const_vec;
4043
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4044
 
4045
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4046
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4047
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4048
    dst0_r >>= 6;
4049
    dst0_l >>= 6;
4050
    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4051
 
4052
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4053
    dst4 = const_vec;
4054
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4055
 
4056
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4057
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4058
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4059
    dst1_r >>= 6;
4060
    dst1_l >>= 6;
4061
    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
4062
 
4063
    LD_SB2(src0_ptr, src_stride, src5, src6);
4064
    src0_ptr += (2 * src_stride);
4065
    XORI_B2_128_SB(src5, src6);
4066
    /* row 5 */
4067
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4068
    dst5 = const_vec;
4069
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4070
 
4071
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4072
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4073
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4074
    dst2_r >>= 6;
4075
    dst2_l >>= 6;
4076
    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
4077
 
4078
    /* row 6 */
4079
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4080
    dst6 = const_vec;
4081
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4082
 
4083
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4084
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4085
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4086
    dst3_r >>= 6;
4087
    dst3_l >>= 6;
4088
    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
4089
 
4090
    LD_SB2(src0_ptr, src_stride, src7, src8);
4091
    XORI_B2_128_SB(src7, src8);
4092
    /* row 7 */
4093
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4094
    dst7 = const_vec;
4095
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4096
 
4097
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4098
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4099
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4100
 
4101
    dst4_r >>= 6;
4102
    dst4_l >>= 6;
4103
    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
4104
    /* row 8 */
4105
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4106
    dst8 = const_vec;
4107
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4108
 
4109
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4110
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4111
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
4112
    dst5_r >>= 6;
4113
    dst5_l >>= 6;
4114
    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
4115
 
4116
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4117
                      tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
4118
    HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);
4119
 
4120
    PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4121
    dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4122
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4123
    dst += (4 * dst_stride);
4124
    ST8x2_UB(dst2_r, dst, dst_stride);
4125
}
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
4128
                                          int32_t src_stride,
4129
                                          int16_t *src1_ptr,
4130
                                          int32_t src2_stride,
4131
                                          uint8_t *dst,
4132
                                          int32_t dst_stride,
4133
                                          const int8_t *filter_x,
4134
                                          const int8_t *filter_y,
4135
                                          int32_t height,
4136
                                          int32_t width)
4137
{
4138
    uint32_t loop_cnt, cnt;
4139
    uint8_t *src0_ptr_tmp;
4140
    int16_t *src1_ptr_tmp;
4141
    uint8_t *dst_tmp;
4142
    v16i8 src0, src1, src2, src3, src4, src5, src6;
4143
    v8i16 in0, in1, in2, in3;
4144
    v8i16 filt0, filt1;
4145
    v4i32 filt_h0, filt_h1;
4146
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4147
    v16i8 mask1;
4148
    v8i16 filter_vec, const_vec;
4149
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4150
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4151
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4152
    v8i16 tmp0, tmp1, tmp2, tmp3;
4153
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4154
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4155
 
4156
    src0_ptr -= (src_stride + 1);
4157
 
4158
    filter_vec = LD_SH(filter_x);
4159
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4160
 
4161
    filter_vec = LD_SH(filter_y);
4162
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4163
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4164
 
4165
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4166
 
4167
    mask1 = mask0 + 2;
4168
 
4169
    const_vec = __msa_ldi_h(128);
4170
    const_vec <<= 6;
4171
 
4172
    for (cnt = width >> 3; cnt--;) {
4173
        src0_ptr_tmp = src0_ptr;
4174
        dst_tmp = dst;
4175
        src1_ptr_tmp = src1_ptr;
4176
 
4177
        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4178
        src0_ptr_tmp += (3 * src_stride);
4179
        XORI_B3_128_SB(src0, src1, src2);
4180
 
4181
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4182
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4183
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4184
        dst0 = const_vec;
4185
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4186
        dst1 = const_vec;
4187
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4188
        dst2 = const_vec;
4189
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4190
 
4191
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4192
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4193
 
4194
        for (loop_cnt = height >> 2; loop_cnt--;) {
4195
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4196
            src0_ptr_tmp += (4 * src_stride);
4197
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4198
            src1_ptr_tmp += (4 * src2_stride);
4199
            XORI_B4_128_SB(src3, src4, src5, src6);
4200
 
4201
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4202
            dst3 = const_vec;
4203
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4204
 
4205
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4206
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4207
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4208
            dst0_r >>= 6;
4209
            dst0_l >>= 6;
4210
 
4211
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4212
            dst4 = const_vec;
4213
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4214
 
4215
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4216
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4217
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4218
            dst1_r >>= 6;
4219
            dst1_l >>= 6;
4220
 
4221
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4222
            dst5 = const_vec;
4223
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4224
 
4225
            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4226
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4227
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
4228
            dst2_r >>= 6;
4229
            dst2_l >>= 6;
4230
 
4231
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4232
            dst2 = const_vec;
4233
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4234
 
4235
            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4236
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4237
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
4238
            dst3_r >>= 6;
4239
            dst3_l >>= 6;
4240
 
4241
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
4242
                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4243
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4244
                              tmp0, tmp1, tmp2, tmp3, 7,
4245
                              tmp0, tmp1, tmp2, tmp3);
4246
 
4247
            PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4248
            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4249
            dst_tmp += (4 * dst_stride);
4250
        }
4251
 
4252
        src0_ptr += 8;
4253
        dst += 8;
4254
        src1_ptr += 8;
4255
    }
4256
}
/* HEVC bi-pred HV 4-tap filter, width 8: dispatch on height to the
 * specialized 8x2 / 8x6 kernels, otherwise use the generic 8-column-strip
 * kernel (height assumed a multiple of 4 there). */
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height, 8);
    }
}
/* HEVC bi-pred HV 4-tap filter, width 12: the left 8 columns go through the
 * 8-column-strip kernel, the remaining 4 columns through the width-4
 * dispatcher (the two write disjoint regions of dst). */
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
    hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter_x, filter_y, height);
}
/* HEVC bi-pred HV 4-tap filter, width 16: thin wrapper over the
 * 8-column-strip kernel. */
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}
/* HEVC bi-pred HV 4-tap filter, width 24: thin wrapper over the
 * 8-column-strip kernel. */
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}
/* 32-column bi-prediction 2D (hv) 4-tap filter: delegates to the generic
 * 8-column-multiple kernel with width = 32, matching the 16w/24w siblings.
 *
 * Fix: the filter_y parameter was declared "const const int8_t *" —
 * a duplicated type qualifier (a typo; every sibling uses a single const).
 * Removed the redundant qualifier. */
static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}

/* Generate the public bi-prediction "pel_pixels" (no-filter copy) entry
 * points, one per block width.  Each generated function forwards to the
 * corresponding hevc_bi_copy_<WIDTH>w_msa() kernel, with the 16-bit
 * reference plane src_16bit assumed to have a stride of MAX_PB_SIZE.
 * The mx, my and width parameters exist only to match the common
 * ff_hevc_put_hevc_* function-pointer signature and are unused here. */
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}

/* Instantiate the copy entry points for every supported block width. */
BI_MC_COPY(4);
BI_MC_COPY(6);
BI_MC_COPY(8);
BI_MC_COPY(12);
BI_MC_COPY(16);
BI_MC_COPY(24);
BI_MC_COPY(32);
BI_MC_COPY(48);
BI_MC_COPY(64);

#undef BI_MC_COPY

/* Generate the public 1-D bi-prediction interpolation entry points.
 *   PEL      - filter family: qpel (8-tap) or epel (4-tap)
 *   DIR      - direction letter in the public name: h or v
 *   WIDTH    - block width the function handles
 *   TAP      - tap count used to build the kernel name (8 or 4)
 *   DIR1     - direction tag in the kernel name: hz or vt
 *   FILT_DIR - which fractional-position argument (mx or my) selects the
 *              filter; the table is indexed with FILT_DIR - 1
 * Each generated function looks up the filter coefficients in
 * ff_hevc_<PEL>_filters and forwards to the matching
 * hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa kernel (src_16bit stride is
 * MAX_PB_SIZE).  The unused mx-or-my and width parameters only pad the
 * common function-pointer signature. */
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
                                                        ptrdiff_t dst_stride,  \
                                                        uint8_t *src,          \
                                                        ptrdiff_t src_stride,  \
                                                        int16_t *src_16bit,    \
                                                        int height,            \
                                                        intptr_t mx,           \
                                                        intptr_t my,           \
                                                        int width)             \
{                                                                              \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
                                                                               \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
                                             MAX_PB_SIZE, dst, dst_stride,     \
                                             filter, height);                  \
}

/* Horizontal 8-tap (qpel) entry points, filter chosen by mx. */
BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

/* Vertical 8-tap (qpel) entry points, filter chosen by my. */
BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

/* Horizontal 4-tap (epel) entry points, filter chosen by mx. */
BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

/* Vertical 4-tap (epel) entry points, filter chosen by my. */
BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);

#undef BI_MC

/* Generate the public 2-D (separable horizontal+vertical) bi-prediction
 * entry points.  Unlike BI_MC, both fractional positions are used:
 * filter_x comes from ff_hevc_<PEL>_filters[mx - 1] and filter_y from
 * ff_hevc_<PEL>_filters[my - 1].  The call forwards to
 * hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa with src_16bit stride MAX_PB_SIZE;
 * the trailing width parameter only pads the common signature. */
#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                   \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
                                                        ptrdiff_t dst_stride,  \
                                                        uint8_t *src,          \
                                                        ptrdiff_t src_stride,  \
                                                        int16_t *src_16bit,    \
                                                        int height,            \
                                                        intptr_t mx,           \
                                                        intptr_t my,           \
                                                        int width)             \
{                                                                              \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
                                                                               \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
                                             MAX_PB_SIZE, dst, dst_stride,     \
                                             filter_x, filter_y,               \
                                             height);                          \
}

/* 2-D 8-tap (qpel) entry points. */
BI_MC_HV(qpel, hv, 4, 8, hv);
BI_MC_HV(qpel, hv, 8, 8, hv);
BI_MC_HV(qpel, hv, 12, 8, hv);
BI_MC_HV(qpel, hv, 16, 8, hv);
BI_MC_HV(qpel, hv, 24, 8, hv);
BI_MC_HV(qpel, hv, 32, 8, hv);
BI_MC_HV(qpel, hv, 48, 8, hv);
BI_MC_HV(qpel, hv, 64, 8, hv);

/* 2-D 4-tap (epel) entry points. */
BI_MC_HV(epel, hv, 4, 4, hv);
BI_MC_HV(epel, hv, 8, 4, hv);
BI_MC_HV(epel, hv, 6, 4, hv);
BI_MC_HV(epel, hv, 12, 4, hv);
BI_MC_HV(epel, hv, 16, 4, hv);
BI_MC_HV(epel, hv, 24, 4, hv);
BI_MC_HV(epel, hv, 32, 4, hv);

#undef BI_MC_HV