/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
/* Combine two bi-prediction terms per 16-bit lane:
 * saturating add of in* and vec*, rounding arithmetic right shift by
 * rnd_val, then clamp to the 8-bit pixel range [0, 255]. */
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}

/* Four-vector variant: applies HEVC_BI_RND_CLIP2 to two pairs. */
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                          \
                          vec0, vec1, vec2, vec3, rnd_val,             \
                          out0, out1, out2, out3)                      \
{                                                                      \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);      \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);      \
}

/* Bi-prediction "copy" for 4-column blocks.
 * Zero-extends the 8-bit first prediction (src0_ptr), shifts it left by 6
 * into the 16-bit intermediate domain, adds the 16-bit second prediction
 * (src1_ptr), then rounds (>> 7 with rounding), clamps to [0, 255] and
 * stores 4 bytes per row.
 * Supported heights: 2, 4, and any multiple of 8 (other values fall
 * through and store nothing — callers are expected to respect this). */
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 dst0, in0, in1;

        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);

        /* Pack both 4-byte rows / both 4-short rows into single vectors
         * so the whole block is processed in one shot. */
        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);  /* u8 -> s16 */
        dst0 <<= 6;
        dst0 += in0;
        dst0 = __msa_srari_h(dst0, 7);
        dst0 = CLIP_SH_0_255(dst0);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST4x2_UB(dst0, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 dst0, dst1;
        v8i16 in0, in1, in2, in3;

        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        /* Two source rows per vector, two 16-bit rows per vector. */
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);

        dst0 <<= 6;
        dst1 <<= 6;
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 dst0, dst1, dst2, dst3;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        /* 8 rows per iteration. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src0_ptr, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src0_ptr += (8 * src_stride);

            LD_SH8(src1_ptr, src2_stride,
                   in0, in1, in2, in3, in4, in5, in6, in7);
            src1_ptr += (8 * src2_stride);

            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST4x8_UB(dst0, dst1, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

/* Bi-prediction "copy" for 6-column blocks: (src0 << 6) + src1, round by 7,
 * clamp to [0, 255], store 6 bytes per row. Processes 8 rows per loop
 * iteration, so height must be a multiple of 8. */
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        /* One row per vector (only the low 6 lanes are eventually stored). */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   dst4, dst5, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5);
        ST6x4_UB(dst4, dst5, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* Bi-prediction "copy" for 8-column blocks: (src0 << 6) + src1, round by 7,
 * clamp to [0, 255], store 8 bytes per row.
 * Supported heights: 2, 4, 6, and any multiple of 8. */
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0, in1;
        v8i16 dst0, dst1;

        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);  /* u8 -> s16 */

        dst0 <<= 6;
        dst1 <<= 6;
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(dst0, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;
        v8i16 dst0, dst1, dst2, dst3;

        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;
        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        /* First 4 rows, then the remaining 2. */
        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST8x2_UB(dst2, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;
        v8i16 dst0, dst1, dst2, dst3;
        uint32_t loop_cnt;

        /* 8 rows per iteration, handled as two 4-row batches. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
            src0_ptr += (4 * src_stride);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST8x4_UB(dst0, dst1, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
            src0_ptr += (4 * src_stride);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST8x4_UB(dst0, dst1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

269 | static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr, |
||
270 | int32_t src_stride, |
||
271 | int16_t *src1_ptr, |
||
272 | int32_t src2_stride, |
||
273 | uint8_t *dst, |
||
274 | int32_t dst_stride, |
||
275 | int32_t height) |
||
276 | { |
||
277 | uint32_t loop_cnt; |
||
278 | v16i8 src0, src1, src2, src3; |
||
279 | v8i16 in0, in1, in2, in3, in4, in5, in6, in7; |
||
280 | v8i16 dst0, dst1, dst2, dst3, dst4, dst5; |
||
281 | v16i8 zero = { 0 }; |
||
282 | |||
283 | for (loop_cnt = (16 >> 2); loop_cnt--;) { |
||
284 | LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); |
||
285 | src0_ptr += (4 * src_stride); |
||
286 | |||
287 | LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); |
||
288 | LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); |
||
289 | src1_ptr += (4 * src2_stride); |
||
290 | ILVR_D2_SH(in5, in4, in7, in6, in4, in5); |
||
291 | ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, |
||
292 | dst0, dst1, dst2, dst3); |
||
293 | |||
294 | SLLI_4V(dst0, dst1, dst2, dst3, 6); |
||
295 | ILVL_W2_SB(src1, src0, src3, src2, src0, src1); |
||
296 | ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); |
||
297 | dst4 <<= 6; |
||
298 | dst5 <<= 6; |
||
299 | HEVC_BI_RND_CLIP4(in0, in1, in2, in3, |
||
300 | dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); |
||
301 | HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); |
||
302 | |||
303 | PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); |
||
304 | dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); |
||
305 | ST12x4_UB(dst0, dst1, dst2, dst, dst_stride); |
||
306 | dst += (4 * dst_stride); |
||
307 | } |
||
308 | } |
||
309 | |||
/* Bi-prediction "copy" for blocks whose width is a multiple of 16:
 * (src0 << 6) + src1, round by 7, clamp to [0, 255].
 * Outer loop walks 16-column tiles; inner loop does 4 rows per tile
 * iteration (height must be a multiple of 4). The _r/_l vector halves
 * hold columns 0-7 / 8-15 of each tile respectively. */
static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16i8 zero = { 0 };

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            v16i8 src0, src1, src2, src3;
            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
            v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
            v8i16 dst0_l, dst1_l, dst2_l, dst3_l;

            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
            src1_ptr_tmp += (4 * src2_stride);

            /* Zero-extend each 16-byte row into low/high 8-lane halves. */
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0_r, dst1_r, dst2_r, dst3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0_l, dst1_l, dst2_l, dst3_l);

            SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
            SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
            /* Rows 0-1: right halves pair with in0/in1, left with in4/in5. */
            HEVC_BI_RND_CLIP4(in0, in1, in4, in5,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* Rows 2-3. */
            HEVC_BI_RND_CLIP4(in2, in3, in6, in7,
                              dst2_r, dst3_r, dst2_l, dst3_l, 7,
                              dst2_r, dst3_r, dst2_l, dst3_l);

            PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
            ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
        }

        /* Advance to the next 16-column tile. */
        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}

373 | static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr, |
||
374 | int32_t src_stride, |
||
375 | int16_t *src1_ptr, |
||
376 | int32_t src2_stride, |
||
377 | uint8_t *dst, |
||
378 | int32_t dst_stride, |
||
379 | int32_t height) |
||
380 | { |
||
381 | hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
382 | dst, dst_stride, height, 16); |
||
383 | } |
||
384 | |||
385 | static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr, |
||
386 | int32_t src_stride, |
||
387 | int16_t *src1_ptr, |
||
388 | int32_t src2_stride, |
||
389 | uint8_t *dst, |
||
390 | int32_t dst_stride, |
||
391 | int32_t height) |
||
392 | { |
||
393 | hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
394 | dst, dst_stride, height, 16); |
||
395 | |||
396 | hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, |
||
397 | dst + 16, dst_stride, height); |
||
398 | } |
||
399 | |||
400 | static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr, |
||
401 | int32_t src_stride, |
||
402 | int16_t *src1_ptr, |
||
403 | int32_t src2_stride, |
||
404 | uint8_t *dst, |
||
405 | int32_t dst_stride, |
||
406 | int32_t height) |
||
407 | { |
||
408 | hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
409 | dst, dst_stride, height, 32); |
||
410 | } |
||
411 | |||
412 | static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr, |
||
413 | int32_t src_stride, |
||
414 | int16_t *src1_ptr, |
||
415 | int32_t src2_stride, |
||
416 | uint8_t *dst, |
||
417 | int32_t dst_stride, |
||
418 | int32_t height) |
||
419 | { |
||
420 | hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
421 | dst, dst_stride, height, 48); |
||
422 | } |
||
423 | |||
424 | static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr, |
||
425 | int32_t src_stride, |
||
426 | int16_t *src1_ptr, |
||
427 | int32_t src2_stride, |
||
428 | uint8_t *dst, |
||
429 | int32_t dst_stride, |
||
430 | int32_t height) |
||
431 | { |
||
432 | hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
433 | dst, dst_stride, height, 64); |
||
434 | } |
||
435 | |||
/* Horizontal 8-tap bi-prediction filter, 4-column blocks.
 * Filters the 8-bit source horizontally with the 8-tap filter, adds the
 * 16-bit second prediction, rounds (>> 7) and clamps to [0, 255].
 * Two rows share one shuffle (mask0 indexes lanes 0-15 across a src pair),
 * and 8 rows are produced per loop iteration, so height must be a
 * multiple of 8. */
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    src0_ptr -= 3;  /* back up to the first of the 8 taps */

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* NOTE(review): 128 << 6 appears to offset the bias introduced by the
     * XOR-by-128 signed conversion below, assuming the 8 filter taps sum
     * to 64 (standard for HEVC interpolation filters) — confirm. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* Pack pairs of 4-wide second-prediction rows together. */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

/* Horizontal 8-tap bi-prediction filter, 8-column blocks.
 * One 8-output row per source vector; 4 rows per loop iteration, so
 * height must be a multiple of 4. */
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;  /* back up to the first of the 8 taps */

    /* NOTE(review): offsets the XOR-by-128 signed-conversion bias,
     * assuming taps sum to 64 — confirm. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

577 | static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr, |
||
578 | int32_t src_stride, |
||
579 | int16_t *src1_ptr, |
||
580 | int32_t src2_stride, |
||
581 | uint8_t *dst, |
||
582 | int32_t dst_stride, |
||
583 | const int8_t *filter, |
||
584 | int32_t height) |
||
585 | { |
||
586 | hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
587 | dst, dst_stride, filter, height); |
||
588 | hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, |
||
589 | dst + 8, dst_stride, filter, height); |
||
590 | } |
||
591 | |||
/* Horizontal 8-tap bi-prediction filter, 16-column blocks.
 * Each row is loaded as two overlapping 16-byte vectors (offset 0 and 8)
 * so that each vector yields 8 filtered outputs; 2 rows per loop
 * iteration, so height must be a multiple of 2. */
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;  /* back up to the first of the 8 taps */
    /* NOTE(review): offsets the XOR-by-128 signed-conversion bias,
     * assuming taps sum to 64 — confirm. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);      /* row 0: bytes 0.. and 8.. */
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);      /* row 1 */
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

/* Horizontal 8-tap bi-prediction filter, 24-column blocks.
 * One row per iteration: outputs 0-7 from src0 alone, 8-15 from the
 * src0/src1 pair via the shifted masks (mask4..mask7), 16-23 from src1.
 * The first 16 bytes are stored as one vector, the last 8 via a 64-bit
 * scalar copy. */
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    v16i8 src0, src1, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr = src0_ptr - 3;  /* back up to the first of the 8 taps */
    /* NOTE(review): offsets the XOR-by-128 signed-conversion bias,
     * assuming taps sum to 64 — confirm. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;   /* straddle the src0/src1 boundary */
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        /* Third 8-lane group: same add/round/clip done without the macro. */
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        ST_SB(tmp0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }
}

/* Horizontal 8-tap bi-prediction filter, 32-column blocks.
 * One row per iteration: outputs 0-7 from src0, 8-15 from the src0/src1
 * pair via the shifted masks, 16-23 from src1, and 24-31 from src2
 * (an extra load at byte offset 24 covering the trailing taps). */
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;  /* back up to the first of the 8 taps */
    /* NOTE(review): offsets the XOR-by-128 signed-conversion bias,
     * assuming taps sum to 64 — confirm. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;   /* straddle the src0/src1 boundary */
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
        dst += dst_stride;
    }
}

/* Horizontal 8-tap bi-prediction luma filter for 48-pixel-wide blocks.
 *
 * Per output row, the 48 pixels are produced in three 16-pixel groups:
 * each group is horizontally filtered with an 8-tap filter (byte shuffles
 * feeding signed dot-product-accumulate), the matching 16-bit bi-pred
 * samples from src1_ptr are added, and the sum is rounded (>> 7 with
 * rounding) and clipped to [0, 255] before being stored as bytes.
 *
 * src0_ptr/src_stride   - 8-bit reference pixels (stride in bytes)
 * src1_ptr/src2_stride  - 16-bit intermediate samples of the other
 *                         prediction direction (stride in elements)
 * dst/dst_stride        - 8-bit destination
 * filter                - 8 signed 8-bit filter taps
 * height                - number of rows to process
 */
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    /* center the 8-tap window: 3 taps lie left of the output position */
    src0_ptr -= 3;

    /* 128 << 6: offset that cancels the -128 bias XOR'd into the source
     * bytes below (assumes the taps sum to 64, the HEVC norm — NOTE(review):
     * grounded in the XORI_B*_128 usage, not visible in this file) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..mask7 select sliding byte pairs for the shuffle-based filter */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        /* group 1: output pixels 0..15 from src bytes 0..31 */
        LD_SB2(src0_ptr, 16, src0, src1);
        XORI_B2_128_SB(src0, src1);
        LD_SH2(src1_ptr, 8, in0, in1);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        /* mask4..7 shuffle across the src0/src1 boundary */
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

        tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_SB(tmp0, dst);

        /* group 2 inputs: bytes 32..55 (src2 at +32, src3 at +40) */
        LD_SB2(src0_ptr + 32, 8, src2, src3);
        XORI_B2_128_SB(src2, src3);
        src0_ptr += src_stride;

        LD_SH2(src1_ptr + 16, 8, in2, in3);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);

        tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
        ST_SB(tmp1, dst + 16);

        /* group 3: output pixels 32..47 */
        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
        dst += dst_stride;
    }
}
||
907 | |||
/* Horizontal 8-tap bi-prediction luma filter for 64-pixel-wide blocks.
 *
 * Each row is handled as two 32-pixel halves (inner cnt loop); each half
 * is filtered in four 8-pixel lanes, combined with the 16-bit bi-pred
 * samples from src1_ptr, rounded (>> 7), clipped to [0, 255] and stored.
 *
 * Parameters as in hevc_hz_bi_8t_48w_msa: 8-bit reference in src0_ptr,
 * 16-bit second-direction samples in src1_ptr, 8-bit output in dst.
 */
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    /* center the 8-tap window (3 taps left of the output position) */
    src0_ptr -= 3;

    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* two 32-pixel halves per row */
        for (cnt = 2; cnt--;) {
            /* src2 overlaps src1 (loaded at +24) so mask0..3 on src2
             * cover output pixels 24..31 */
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst0, dst0, dst0, dst0);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst1, dst1, dst1, dst1);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst2, dst2, dst2, dst2);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst3, dst3, dst3, dst3);

            /* add bi-pred samples, round (>> 7) and clip to 8 bits */
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0, dst1, dst2, dst3, 7,
                              dst0, dst1, dst2, dst3);

            PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
            ST_SB2(tmp0, tmp1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src1_ptr += src2_stride;
        src0_ptr += src_stride;
        dst += dst_stride;
    }
}
||
995 | |||
/* Vertical 8-tap bi-prediction luma filter for 4-pixel-wide blocks.
 *
 * Processes 8 rows per loop iteration (height is assumed to be a multiple
 * of 8). Two consecutive 4-pixel rows are packed into one vector
 * (ILVR_D*) so each dot product produces 8 outputs at once. Results are
 * combined with the 16-bit bi-pred samples from src1_ptr, rounded
 * (>> 7), clipped to [0, 255] and stored 4 bytes per row.
 */
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window starts 3 rows above the first output row */
    src0_ptr -= (3 * src_stride);

    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the first 7 rows and build the interleaved
     * row-pair vectors reused across iterations */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* pack two 4-sample bi-pred rows into each vector */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        /* each DPADD applies all four tap pairs across 8 source rows */
        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);

        /* slide the 8-tap window down by 8 rows */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
||
1081 | |||
/* Vertical 8-tap bi-prediction luma filter for 8-pixel-wide blocks.
 *
 * Processes 4 rows per loop iteration (height assumed multiple of 4).
 * Seven rows are pre-loaded and pairwise byte-interleaved; each output
 * row is an 8-tap dot product over four interleaved row pairs, combined
 * with the 16-bit bi-pred samples, rounded (>> 7) and clipped to 8 bits.
 */
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window starts 3 rows above the first output row */
    src0_ptr -= (3 * src_stride);
    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: first 7 rows, interleaved into row-pair vectors */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        /* add bi-pred samples, round (>> 7) and clip to [0, 255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the 8-tap window down by 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}
||
1159 | |||
/* Vertical 8-tap bi-prediction luma filter for 12-pixel-wide blocks.
 *
 * Processes 4 rows per loop iteration. Pixels 0..7 use the right-half
 * byte interleaves (dst0_r..dst3_r); pixels 8..11 use the left-half
 * interleaves with two 4-pixel rows packed per vector (dst0_l, dst1_l).
 * Filter output is combined with the 16-bit bi-pred samples, rounded
 * (>> 7), clipped to [0, 255] and stored 12 bytes per row.
 */
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window starts 3 rows above the first output row */
    src0_ptr -= (3 * src_stride);
    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: first 7 rows; build both right-half (pixels 0..7)
     * and left-half (pixels 8..15, only 8..11 used) interleaves */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        /* in0..3: bi-pred samples for pixels 0..7; in4..7: pixels 8..11 */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        /* left half: two output rows packed per vector */
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);


        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the 8-tap window down by 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
||
1265 | |||
/* Vertical 8-tap bi-prediction luma filter for blocks whose width is a
 * multiple of 16.
 *
 * Outer loop walks 16-pixel columns; inner loop emits 2 rows per
 * iteration (height assumed even). Each 16-pixel row is split into a
 * right (ILVR) and left (ILVL) byte-interleaved half, filtered with an
 * 8-tap dot product, combined with the 16-bit bi-pred samples, rounded
 * (>> 7) and clipped to [0, 255].
 *
 * width - total block width in pixels, multiple of 16
 */
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                           int32_t src_stride,
                                           int16_t *src1_ptr,
                                           int32_t src2_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter,
                                           int32_t height, int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window starts 3 rows above the first output row */
    src0_ptr -= (3 * src_stride);
    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        /* prologue per column: first 7 rows, interleaved pairwise */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            /* in0/in1: bi-pred pixels 0..7, in2/in3: pixels 8..15 */
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            XORI_B2_128_SB(src7, src8);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* slide the 8-tap window down by 2 rows */
            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        /* next 16-pixel column */
        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
||
1371 | |||
/* Vertical 8-tap bi-prediction filter, 16-pixel-wide blocks:
 * thin wrapper over the generic multiple-of-16 implementation. */
static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
}
||
1384 | |||
/* Vertical 8-tap bi-prediction filter, 24-pixel-wide blocks:
 * handled as a 16-pixel column (generic x16 kernel) plus the
 * remaining 8-pixel column at offset +16. */
static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, filter, height);
}
||
1399 | |||
/* Vertical 8-tap bi-prediction filter, 32-pixel-wide blocks:
 * thin wrapper over the generic multiple-of-16 implementation. */
static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 32);
}
||
1412 | |||
/* Vertical 8-tap bi-prediction filter, 48-pixel-wide blocks:
 * thin wrapper over the generic multiple-of-16 implementation. */
static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 48);
}
||
1425 | |||
/* Vertical 8-tap bi-prediction filter, 64-pixel-wide blocks:
 * thin wrapper over the generic multiple-of-16 implementation. */
static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 64);
}
||
1438 | |||
/* 2-D (horizontal then vertical) 8-tap bi-prediction luma filter for
 * 4-pixel-wide blocks.
 *
 * The special mask0 shuffles two rows at once (bytes 0..7 from the first
 * operand, 16..23 from the second), so each horizontal dot product yields
 * two rows of 4 intermediate 16-bit samples (dst30 = rows 3 and 0, etc.).
 * These are interleaved into column vectors and filtered vertically with
 * 32-bit precision (HEVC_FILT_8TAP), scaled down (>> 6), combined with
 * the 16-bit bi-pred samples, rounded (>> 7) and clipped to [0, 255].
 * Two output rows are produced per loop iteration (height assumed even).
 *
 * filter_x / filter_y - 8 signed taps for the horizontal / vertical pass
 */
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r, in0_r, in0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    /* indices >= 16 pick from the second shuffle operand: two rows per op */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    /* 2-D 8-tap window: 3 rows above and 3 columns left of the output */
    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* sign-extend the 8-bit vertical taps to 16 bits
     * (ilvr with their sign mask), then splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* bias compensation for the -128 XOR applied to source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    /* horizontal pass over the 7 prologue rows, two rows per vector */
    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    /* rearrange row pairs into vertically adjacent columns */
    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    /* dst66: row 6 duplicated into both halves for the next interleave */
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        src1_ptr += (2 * src2_stride);

        /* pack the two 4-sample bi-pred rows into one vector */
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
        XORI_B2_128_SB(src7, src8);

        /* horizontal pass on the two new rows (7 and 8) */
        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        /* drop the intermediate precision, then bi-pred add + round + clip */
        dst0_r >>= 6;
        dst1_r >>= 6;
        UNPCK_SH_SW(in0, in0_r, in0_l);
        dst0_r = __msa_adds_s_w(dst0_r, in0_r);
        dst1_r = __msa_adds_s_w(dst1_r, in0_l);
        SRARI_W2_SW(dst0_r, dst1_r, 7);
        dst0_r = CLIP_SW_0_255(dst0_r);
        dst1_r = CLIP_SW_0_255(dst1_r);

        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
        ST4x2_UB(dst0_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* slide the vertical 8-tap window down by 2 rows */
        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
    }
}
||
1557 | |||
1558 | static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr, |
||
1559 | int32_t src_stride, |
||
1560 | int16_t *src1_ptr, |
||
1561 | int32_t src2_stride, |
||
1562 | uint8_t *dst, |
||
1563 | int32_t dst_stride, |
||
1564 | const int8_t *filter_x, |
||
1565 | const int8_t *filter_y, |
||
1566 | int32_t height, int32_t width) |
||
1567 | { |
||
1568 | uint32_t loop_cnt; |
||
1569 | uint32_t cnt; |
||
1570 | uint8_t *src0_ptr_tmp; |
||
1571 | int16_t *src1_ptr_tmp; |
||
1572 | uint8_t *dst_tmp; |
||
1573 | v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; |
||
1574 | v8i16 in0, in1; |
||
1575 | v4i32 in0_r, in0_l, in1_r, in1_l; |
||
1576 | v8i16 filt0, filt1, filt2, filt3; |
||
1577 | v4i32 filt_h0, filt_h1, filt_h2, filt_h3; |
||
1578 | v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
||
1579 | v16i8 mask1, mask2, mask3; |
||
1580 | v8i16 filter_vec, const_vec; |
||
1581 | v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
||
1582 | v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; |
||
1583 | v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; |
||
1584 | v4i32 dst0_r, dst0_l, dst1_r, dst1_l; |
||
1585 | v8i16 dst10_r, dst32_r, dst54_r, dst76_r; |
||
1586 | v8i16 dst10_l, dst32_l, dst54_l, dst76_l; |
||
1587 | v8i16 dst21_r, dst43_r, dst65_r, dst87_r; |
||
1588 | v8i16 dst21_l, dst43_l, dst65_l, dst87_l; |
||
1589 | |||
1590 | src0_ptr -= ((3 * src_stride) + 3); |
||
1591 | const_vec = __msa_ldi_h(128); |
||
1592 | const_vec <<= 6; |
||
1593 | |||
1594 | filter_vec = LD_SH(filter_x); |
||
1595 | SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); |
||
1596 | |||
1597 | filter_vec = LD_SH(filter_y); |
||
1598 | vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); |
||
1599 | filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); |
||
1600 | SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); |
||
1601 | |||
1602 | mask1 = mask0 + 2; |
||
1603 | mask2 = mask0 + 4; |
||
1604 | mask3 = mask0 + 6; |
||
1605 | |||
1606 | for (cnt = width >> 3; cnt--;) { |
||
1607 | src0_ptr_tmp = src0_ptr; |
||
1608 | dst_tmp = dst; |
||
1609 | src1_ptr_tmp = src1_ptr; |
||
1610 | |||
1611 | LD_SB7(src0_ptr_tmp, src_stride, |
||
1612 | src0, src1, src2, src3, src4, src5, src6); |
||
1613 | src0_ptr_tmp += (7 * src_stride); |
||
1614 | XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); |
||
1615 | |||
1616 | /* row 0 row 1 row 2 row 3 */ |
||
1617 | VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, |
||
1618 | vec0, vec1, vec2, vec3); |
||
1619 | VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, |
||
1620 | vec4, vec5, vec6, vec7); |
||
1621 | VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, |
||
1622 | vec8, vec9, vec10, vec11); |
||
1623 | VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, |
||
1624 | vec12, vec13, vec14, vec15); |
||
1625 | dst0 = const_vec; |
||
1626 | DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, |
||
1627 | dst0, dst0, dst0, dst0); |
||
1628 | dst1 = const_vec; |
||
1629 | DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, |
||
1630 | dst1, dst1, dst1, dst1); |
||
1631 | dst2 = const_vec; |
||
1632 | DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, |
||
1633 | dst2, dst2, dst2, dst2); |
||
1634 | dst3 = const_vec; |
||
1635 | DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, |
||
1636 | dst3, dst3, dst3, dst3); |
||
1637 | |||
1638 | VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, |
||
1639 | vec0, vec1, vec2, vec3); |
||
1640 | VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, |
||
1641 | vec4, vec5, vec6, vec7); |
||
1642 | VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, |
||
1643 | vec8, vec9, vec10, vec11); |
||
1644 | dst4 = const_vec; |
||
1645 | DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, |
||
1646 | dst4, dst4, dst4, dst4); |
||
1647 | dst5 = const_vec; |
||
1648 | DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, |
||
1649 | dst5, dst5, dst5, dst5); |
||
1650 | dst6 = const_vec; |
||
1651 | DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, |
||
1652 | dst6, dst6, dst6, dst6); |
||
1653 | |||
1654 | ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, |
||
1655 | dst10_r, dst32_r, dst54_r, dst21_r); |
||
1656 | ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); |
||
1657 | ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, |
||
1658 | dst10_l, dst32_l, dst54_l, dst21_l); |
||
1659 | ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); |
||
1660 | |||
1661 | for (loop_cnt = height >> 1; loop_cnt--;) { |
||
1662 | /* row 7 */ |
||
1663 | LD_SB2(src0_ptr_tmp, src_stride, src7, src8); |
||
1664 | XORI_B2_128_SB(src7, src8); |
||
1665 | src0_ptr_tmp += 2 * src_stride; |
||
1666 | |||
1667 | LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); |
||
1668 | src1_ptr_tmp += (2 * src2_stride); |
||
1669 | |||
1670 | VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, |
||
1671 | vec0, vec1, vec2, vec3); |
||
1672 | dst7 = const_vec; |
||
1673 | DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, |
||
1674 | dst7, dst7, dst7, dst7); |
||
1675 | |||
1676 | ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); |
||
1677 | dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, |
||
1678 | filt_h0, filt_h1, filt_h2, filt_h3); |
||
1679 | dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, |
||
1680 | filt_h0, filt_h1, filt_h2, filt_h3); |
||
1681 | dst0_r >>= 6; |
||
1682 | dst0_l >>= 6; |
||
1683 | |||
1684 | VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, |
||
1685 | vec0, vec1, vec2, vec3); |
||
1686 | dst8 = const_vec; |
||
1687 | DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, |
||
1688 | dst8, dst8, dst8, dst8); |
||
1689 | |||
1690 | ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); |
||
1691 | dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, |
||
1692 | filt_h0, filt_h1, filt_h2, filt_h3); |
||
1693 | dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, |
||
1694 | filt_h0, filt_h1, filt_h2, filt_h3); |
||
1695 | dst1_r >>= 6; |
||
1696 | dst1_l >>= 6; |
||
1697 | |||
1698 | UNPCK_SH_SW(in0, in0_r, in0_l); |
||
1699 | UNPCK_SH_SW(in1, in1_r, in1_l); |
||
1700 | in0_r = __msa_adds_s_w(in0_r, dst0_r); |
||
1701 | in0_l = __msa_adds_s_w(in0_l, dst0_l); |
||
1702 | in1_r = __msa_adds_s_w(in1_r, dst1_r); |
||
1703 | in1_l = __msa_adds_s_w(in1_l, dst1_l); |
||
1704 | SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7); |
||
1705 | in0_r = CLIP_SW_0_255(in0_r); |
||
1706 | in0_l = CLIP_SW_0_255(in0_l); |
||
1707 | in1_r = CLIP_SW_0_255(in1_r); |
||
1708 | in1_l = CLIP_SW_0_255(in1_l); |
||
1709 | |||
1710 | HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r); |
||
1711 | ST8x2_UB(dst0_r, dst_tmp, dst_stride); |
||
1712 | dst_tmp += (2 * dst_stride); |
||
1713 | |||
1714 | dst10_r = dst32_r; |
||
1715 | dst32_r = dst54_r; |
||
1716 | dst54_r = dst76_r; |
||
1717 | dst10_l = dst32_l; |
||
1718 | dst32_l = dst54_l; |
||
1719 | dst54_l = dst76_l; |
||
1720 | dst21_r = dst43_r; |
||
1721 | dst43_r = dst65_r; |
||
1722 | dst65_r = dst87_r; |
||
1723 | dst21_l = dst43_l; |
||
1724 | dst43_l = dst65_l; |
||
1725 | dst65_l = dst87_l; |
||
1726 | dst6 = dst8; |
||
1727 | } |
||
1728 | |||
1729 | src0_ptr += 8; |
||
1730 | dst += 8; |
||
1731 | src1_ptr += 8; |
||
1732 | } |
||
1733 | } |
||
1734 | |||
/* Bi-predictive 2-D (horizontal + vertical) 8-tap HEVC interpolation for an
 * 8-pixel-wide block.  Thin dispatcher: forwards to the generic
 * 8-column-multiple kernel with width fixed to 8.
 *
 * src0_ptr/src_stride   : 8-bit reference pixels
 * src1_ptr/src2_stride  : 16-bit intermediate samples of the second
 *                         prediction, added before final rounding
 * dst/dst_stride        : 8-bit output block
 * filter_x, filter_y    : 8-tap horizontal / vertical filter coefficients
 * height                : number of output rows
 */
static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
}
||
1749 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation for a 12-pixel-wide block.
 * Splits the work as 8 + 4 columns: the generic 8-column kernel covers
 * columns 0..7, then the dedicated 4-wide kernel covers columns 8..11
 * (all pointers advanced by 8 elements).
 */
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);

    hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter_x, filter_y, height);
}
||
1767 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation, 16-pixel-wide block.
 * Dispatcher to the generic 8-column-multiple kernel with width = 16.
 */
static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}
||
1782 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation, 24-pixel-wide block.
 * Dispatcher to the generic 8-column-multiple kernel with width = 24.
 */
static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}
||
1797 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation, 32-pixel-wide block.
 * Dispatcher to the generic 8-column-multiple kernel with width = 32.
 */
static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}
||
1812 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation, 48-pixel-wide block.
 * Dispatcher to the generic 8-column-multiple kernel with width = 48.
 */
static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 48);
}
||
1827 | |||
/* Bi-predictive 2-D 8-tap HEVC interpolation, 64-pixel-wide block.
 * Dispatcher to the generic 8-column-multiple kernel with width = 64.
 */
static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 64);
}
||
1842 | |||
1843 | static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, |
||
1844 | int32_t src_stride, |
||
1845 | int16_t *src1_ptr, |
||
1846 | int32_t src2_stride, |
||
1847 | uint8_t *dst, |
||
1848 | int32_t dst_stride, |
||
1849 | const int8_t *filter, |
||
1850 | int32_t height) |
||
1851 | { |
||
1852 | v8i16 filt0, filt1; |
||
1853 | v16i8 src0, src1, dst0, vec0, vec1; |
||
1854 | v8i16 in0, in1; |
||
1855 | v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; |
||
1856 | v16i8 mask1; |
||
1857 | v8i16 tmp0; |
||
1858 | v8i16 filter_vec, const_vec; |
||
1859 | |||
1860 | src0_ptr -= 1; |
||
1861 | |||
1862 | const_vec = __msa_ldi_h(128); |
||
1863 | const_vec <<= 6; |
||
1864 | |||
1865 | filter_vec = LD_SH(filter); |
||
1866 | SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); |
||
1867 | |||
1868 | mask1 = mask0 + 2; |
||
1869 | |||
1870 | LD_SB2(src0_ptr, src_stride, src0, src1); |
||
1871 | LD_SH2(src1_ptr, src2_stride, in0, in1); |
||
1872 | in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); |
||
1873 | XORI_B2_128_SB(src0, src1); |
||
1874 | VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); |
||
1875 | tmp0 = const_vec; |
||
1876 | DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); |
||
1877 | |||
1878 | tmp0 = __msa_adds_s_h(tmp0, in0); |
||
1879 | tmp0 = __msa_srari_h(tmp0, 7); |
||
1880 | tmp0 = CLIP_SH_0_255(tmp0); |
||
1881 | dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); |
||
1882 | |||
1883 | ST4x2_UB(dst0, dst, dst_stride); |
||
1884 | } |
||
1885 | |||
1886 | static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, |
||
1887 | int32_t src_stride, |
||
1888 | int16_t *src1_ptr, |
||
1889 | int32_t src2_stride, |
||
1890 | uint8_t *dst, |
||
1891 | int32_t dst_stride, |
||
1892 | const int8_t *filter, |
||
1893 | int32_t height) |
||
1894 | { |
||
1895 | v8i16 filt0, filt1; |
||
1896 | v16i8 src0, src1, src2, src3, dst0, vec0, vec1; |
||
1897 | v8i16 in0, in1, in2, in3; |
||
1898 | v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; |
||
1899 | v16i8 mask1; |
||
1900 | v8i16 tmp0, tmp1; |
||
1901 | v8i16 filter_vec, const_vec; |
||
1902 | |||
1903 | src0_ptr -= 1; |
||
1904 | |||
1905 | const_vec = __msa_ldi_h(128); |
||
1906 | const_vec <<= 6; |
||
1907 | |||
1908 | filter_vec = LD_SH(filter); |
||
1909 | SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); |
||
1910 | |||
1911 | mask1 = mask0 + 2; |
||
1912 | |||
1913 | LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); |
||
1914 | LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); |
||
1915 | |||
1916 | ILVR_D2_SH(in1, in0, in3, in2, in0, in1); |
||
1917 | XORI_B4_128_SB(src0, src1, src2, src3); |
||
1918 | |||
1919 | VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); |
||
1920 | tmp0 = const_vec; |
||
1921 | DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); |
||
1922 | VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); |
||
1923 | tmp1 = const_vec; |
||
1924 | DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1); |
||
1925 | HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1); |
||
1926 | dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); |
||
1927 | |||
1928 | ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); |
||
1929 | } |
||
1930 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 4 pixels wide, height a
 * multiple of 8.  Per iteration: filters 8 source rows (two rows per
 * vector pass), adds the 16-bit second-prediction samples, rounds by 7
 * and clips to 8-bit pixel range.
 */
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 dst0, dst1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    /* shuffle pattern interleaving two rows' 4-wide sliding windows */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v16i8 mask1, vec0, vec1;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B8_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pair 4-sample reference rows two-by-two into 8-lane vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* each VSHF/DPADD pass filters two rows at once */
        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        tmp0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        tmp1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        tmp2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        tmp3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);

        /* add references, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
||
1993 | |||
1994 | static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr, |
||
1995 | int32_t src_stride, |
||
1996 | int16_t *src1_ptr, |
||
1997 | int32_t src2_stride, |
||
1998 | uint8_t *dst, |
||
1999 | int32_t dst_stride, |
||
2000 | const int8_t *filter, |
||
2001 | int32_t height) |
||
2002 | { |
||
2003 | if (2 == height) { |
||
2004 | hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2005 | dst, dst_stride, filter, height); |
||
2006 | } else if (4 == height) { |
||
2007 | hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2008 | dst, dst_stride, filter, height); |
||
2009 | } else if (8 == height || 16 == height) { |
||
2010 | hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride, |
||
2011 | src1_ptr, src2_stride, |
||
2012 | dst, dst_stride, filter, height); |
||
2013 | } |
||
2014 | } |
||
2015 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 6 pixels wide, height a
 * multiple of 4.  Filters 8 lanes per row and stores only 6 of them
 * (ST6x4_UB).  Second-prediction samples are added, then rounded by 7
 * and clipped to pixel range.
 */
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    /* 8-lane sliding-window shuffle within a single row */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B4_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* one VSHF/DPADD pass per row */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* add references, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
2073 | |||
2074 | static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, |
||
2075 | int32_t src_stride, |
||
2076 | int16_t *src1_ptr, |
||
2077 | int32_t src2_stride, |
||
2078 | uint8_t *dst, |
||
2079 | int32_t dst_stride, |
||
2080 | const int8_t *filter, |
||
2081 | int32_t height) |
||
2082 | { |
||
2083 | v8i16 filt0, filt1; |
||
2084 | v16i8 src0, src1; |
||
2085 | v8i16 in0, in1; |
||
2086 | v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
||
2087 | v16i8 mask1, vec0, vec1; |
||
2088 | v8i16 dst0, dst1; |
||
2089 | v8i16 filter_vec, const_vec; |
||
2090 | |||
2091 | src0_ptr -= 1; |
||
2092 | |||
2093 | const_vec = __msa_ldi_h(128); |
||
2094 | const_vec <<= 6; |
||
2095 | |||
2096 | filter_vec = LD_SH(filter); |
||
2097 | SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); |
||
2098 | |||
2099 | mask1 = mask0 + 2; |
||
2100 | |||
2101 | LD_SB2(src0_ptr, src_stride, src0, src1); |
||
2102 | LD_SH2(src1_ptr, src2_stride, in0, in1); |
||
2103 | XORI_B2_128_SB(src0, src1); |
||
2104 | |||
2105 | VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); |
||
2106 | dst0 = const_vec; |
||
2107 | DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); |
||
2108 | VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); |
||
2109 | dst1 = const_vec; |
||
2110 | DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); |
||
2111 | HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); |
||
2112 | |||
2113 | dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); |
||
2114 | ST8x2_UB(dst0, dst, dst_stride); |
||
2115 | } |
||
2116 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 8 pixels wide, 6 rows.
 * Straight-line kernel: one vector pass per row, second-prediction
 * samples added, rounded by 7 and clipped to pixel range; rows stored
 * as 4 + 2.  `height` is unused (fixed 6-row kernel).
 */
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    /* 8-lane sliding-window shuffle within a single row */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B6_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);

    /* one VSHF/DPADD pass per row */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    /* add references, round by 7, clip to 0..255 */
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2, dst, dst_stride);
}
||
2180 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 8 pixels wide, height a
 * multiple of 4.  Per iteration: filters 4 rows (one vector pass each),
 * adds the second-prediction samples, rounds by 7 and clips to pixel
 * range.
 */
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    /* 8-lane sliding-window shuffle within a single row */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B4_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* one VSHF/DPADD pass per row */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* add references, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
2238 | |||
2239 | static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr, |
||
2240 | int32_t src_stride, |
||
2241 | int16_t *src1_ptr, |
||
2242 | int32_t src2_stride, |
||
2243 | uint8_t *dst, |
||
2244 | int32_t dst_stride, |
||
2245 | const int8_t *filter, |
||
2246 | int32_t height) |
||
2247 | { |
||
2248 | if (2 == height) { |
||
2249 | hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2250 | dst, dst_stride, filter, height); |
||
2251 | } else if (6 == height) { |
||
2252 | hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2253 | dst, dst_stride, filter, height); |
||
2254 | } else if (0 == (height % 4)) { |
||
2255 | hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride, |
||
2256 | src1_ptr, src2_stride, |
||
2257 | dst, dst_stride, filter, height); |
||
2258 | } |
||
2259 | } |
||
2260 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 12 pixels wide, height a
 * multiple of 4.  Columns 0..7 are filtered one row per pass with
 * mask0/mask1; columns 8..11 of two rows are filtered together with
 * mask2/mask3 (which pick bytes from a pair of source rows).  The
 * second-prediction samples are added, rounded by 7 and clipped.
 */
static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    /* 8-lane sliding window within one row (columns 0..7) */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    /* window over columns 8..11 of two rows at once */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B4_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        /* pack the 4-sample right-hand reference rows two-by-two */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* columns 0..7: one pass per row */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        /* columns 8..11: two rows per pass */
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        /* add references, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
2333 | |||
/* Bi-predictive horizontal 4-tap HEVC filter, 16 pixels wide, height a
 * multiple of 4.  Each output row is handled as two 8-lane halves
 * (even-numbered vectors = left half, odd = right half, loaded at
 * offset +8).  Second-prediction samples are added, rounded by 7 and
 * clipped to pixel range.
 */
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    /* 8-lane sliding-window shuffle within a single row */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    /* center the 4-tap window */
    src0_ptr -= 1;

    /* bias term 128 << 6, paired with the XORI_B8_128_SB sign-flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* left halves of 4 rows, then right halves at column offset 8 */
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* one VSHF/DPADD pass per 8-lane half */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        /* add references, round by 7, clip to 0..255 */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
2408 | |||
/* Horizontal 4-tap bi-prediction filter for 24-pixel-wide blocks.
 *
 * src0_ptr/src_stride   : unsigned 8-bit reference pixels (read 1 byte left
 *                         of the block start for the filter footprint).
 * src1_ptr/src2_stride  : 16-bit intermediate samples of the second
 *                         prediction, added before rounding/clipping.
 * dst/dst_stride        : 8-bit output pixels.
 * filter                : 4-tap filter coefficients (taps 0 and 1 splatted;
 *                         the shuffle masks pair the samples symmetrically).
 * height                : number of rows; processed 4 at a time
 *                         (assumes height is a multiple of 4).
 *
 * Layout: the left 16 columns are filtered and stored as two full vectors
 * per row via dst; the right 8 columns are handled by a second pass through
 * dst_tmp / src1_ptr_tmp (both offset by 16).
 */
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap footprint */
    src0_ptr -= 1;

    /* 128 << 6 = 8192: bias compensating the u8->s8 conversion done by
       XORI below (presumably matched to the filter tap sum — see the
       identical pattern in the sibling kernels) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* mask2/mask3 select across the 16-byte boundary of a vector pair */
    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    dst_tmp = dst + 16;
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 4 rows: even-numbered regs hold bytes 0..15, odd regs 16..31 */
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        /* flip sign bit: unsigned pixels -> signed bytes for DPADD */
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* left 16 columns of each of the 4 rows: shuffle sample pairs,
           then dot-product-accumulate onto the bias */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        /* pack shorts down to bytes and store the 16-wide part */
        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* right 8 columns of the same 4 rows */
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
}
||
2513 | |||
/* Horizontal 4-tap bi-prediction filter for 32-pixel-wide blocks.
 *
 * Same contract as the 24w variant: filters u8 reference pels
 * (src0_ptr, read from one byte left of the block), adds the 16-bit
 * second-prediction samples from src1_ptr, rounds by 7 and clips to
 * [0,255] before storing 32 output bytes per row.
 *
 * Two rows are produced per loop iteration; the loop body is the same
 * code repeated verbatim for each row (manual 2x unroll), so height is
 * assumed to be even.  Each row loads vectors at offsets 0, 16 and 24:
 * the extra 24-offset load (src2) supplies the cross-boundary samples
 * for the last 8 columns.
 */
static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    /* back up one column for the 4-tap footprint */
    src0_ptr -= 1;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask2 = mask0 + 8;   /* crosses the src0/src1 16-byte boundary */
    mask3 = mask0 + 10;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* ---- first row ---- */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;

        /* ---- second row (identical sequence) ---- */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;
    }
}
||
2601 | |||
/* Vertical 4-tap bi-prediction filter, 4x2 block (special case).
 *
 * Loads 3 setup rows plus 2 working rows from src0_ptr (starting one
 * row above the block), interleaves rows pairwise so one DPADD can
 * filter both output rows at once, adds the 16-bit second prediction
 * from src1_ptr, then rounds by 7, clips to [0,255] and stores 2 rows
 * of 4 bytes.  The height parameter is unused: this kernel is only
 * dispatched for height == 2.
 */
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2: interleave adjacent rows, merge to one vector */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    /* working rows 3..4 and both second-prediction rows */
    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    /* filter, add second prediction, round by 7, clip to [0,255] */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    dst10 = CLIP_SH_0_255(dst10);

    /* pack to bytes and store two 4-byte rows */
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST4x2_UB(dst10, dst, dst_stride);
}
||
2649 | |||
/* Vertical 4-tap bi-prediction filter, 4x4 block (special case).
 *
 * Same scheme as the 4x2 kernel, extended to 4 output rows: row pairs
 * are interleaved and doubled up into 16-byte vectors so each DPADD
 * produces two output rows.  Rounds by 7 and clips to [0,255] via
 * HEVC_BI_RND_CLIP2.  The height parameter is unused: this kernel is
 * only dispatched for height == 4.
 */
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    /* working rows 3..6 and 4 second-prediction rows (paired up) */
    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);

    /* two DPADDs -> 4 output rows; then add, round 7, clip */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);

    /* pack to bytes and store four 4-byte rows */
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
}
||
2698 | |||
/* Vertical 4-tap bi-prediction filter, 4-wide, height multiple of 8.
 *
 * Generic loop for the 4-wide case: processes 8 output rows per
 * iteration, carrying the interleaved row history (src2110 and src8)
 * across iterations so setup rows are only loaded once.  Each DPADD
 * yields two output rows; results get the second prediction added,
 * are rounded by 7, clipped to [0,255], packed and stored as eight
 * 4-byte rows.
 */
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 (src2110 is live across loop iterations) */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* rows 3..8 plus 8 second-prediction rows, paired up */
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);

        dst10 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        /* last two rows of the group; src2 is reused as row carry for
           the next iteration, and src2110 becomes the new history */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
||
2771 | |||
2772 | static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr, |
||
2773 | int32_t src_stride, |
||
2774 | int16_t *src1_ptr, |
||
2775 | int32_t src2_stride, |
||
2776 | uint8_t *dst, |
||
2777 | int32_t dst_stride, |
||
2778 | const int8_t *filter, |
||
2779 | int32_t height) |
||
2780 | { |
||
2781 | if (2 == height) { |
||
2782 | hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2783 | dst, dst_stride, filter, height); |
||
2784 | } else if (4 == height) { |
||
2785 | hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
2786 | dst, dst_stride, filter, height); |
||
2787 | } else { |
||
2788 | hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride, |
||
2789 | src1_ptr, src2_stride, |
||
2790 | dst, dst_stride, filter, height); |
||
2791 | } |
||
2792 | } |
||
2793 | |||
/* Vertical 4-tap bi-prediction filter, 6-pixel-wide blocks.
 *
 * Filters 8 lanes per row but stores only 6 bytes (ST6x4_UB).
 * Processes 4 output rows per iteration; row history (src2, and the
 * interleaved pairs written back into src10_r/src21_r) is carried
 * across iterations.  Assumes height is a multiple of 4.
 */
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* rows for the first two outputs of the group */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* rows for the last two outputs; src2 becomes next iteration's
           history and src10_r/src21_r are recycled as its interleaves */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
2856 | |||
/* Vertical 4-tap bi-prediction filter, 8x2 block (special case).
 *
 * Loads 3 setup rows plus 2 working rows, one DPADD per output row,
 * adds the second prediction, rounds by 7, clips to [0,255] and stores
 * two 8-byte rows.  The height parameter is unused: this kernel is
 * only dispatched for height == 2.
 */
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* working rows 3..4 and both second-prediction rows */
    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    /* add second prediction, round by 7, clip to [0,255] */
    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);

    ST8x2_UB(dst0_r, dst, dst_stride);
}
||
2900 | |||
/* Vertical 4-tap bi-prediction filter, 8x6 block (special case).
 *
 * Fully unrolled: 3 setup rows plus 6 working rows, one DPADD per
 * output row, second prediction added, round by 7, clip to [0,255],
 * stored as 4 + 2 eight-byte rows.  The height parameter is unused:
 * this kernel is only dispatched for height == 6.
 */
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* working rows 3..8 and 6 second-prediction rows */
    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* one DPADD per output row, each on its sliding pair of
       interleaved rows */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    dst4_r = const_vec;
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    dst5_r = const_vec;
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    /* add second prediction, round by 7, clip to [0,255] */
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2_r, dst, dst_stride);
}
||
2961 | |||
/* Vertical 4-tap bi-prediction filter, 8-wide, height multiple of 4.
 *
 * Generic loop for the 8-wide case: same structure as the 6w kernel
 * (4 output rows per iteration, carried row history) but stores full
 * 8-byte rows.
 */
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* src2 becomes next iteration's history row; src10_r/src21_r
           are recycled as its interleaved pairs */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
3023 | |||
3024 | static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, |
||
3025 | int32_t src_stride, |
||
3026 | int16_t *src1_ptr, |
||
3027 | int32_t src2_stride, |
||
3028 | uint8_t *dst, |
||
3029 | int32_t dst_stride, |
||
3030 | const int8_t *filter, |
||
3031 | int32_t height) |
||
3032 | { |
||
3033 | if (2 == height) { |
||
3034 | hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
3035 | dst, dst_stride, filter, height); |
||
3036 | } else if (6 == height) { |
||
3037 | hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
3038 | dst, dst_stride, filter, height); |
||
3039 | } else { |
||
3040 | hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride, |
||
3041 | src1_ptr, src2_stride, |
||
3042 | dst, dst_stride, filter, height); |
||
3043 | } |
||
3044 | } |
||
3045 | |||
/* Vertical 4-tap bi-prediction filter, 12-pixel-wide blocks.
 *
 * Splits each row into a left 8-wide part (right-interleaved pairs,
 * ..._r vectors, one DPADD per output row) and a right 4-wide part
 * (left-interleaved halves merged two rows per vector, ..._l/src2110
 * path, one DPADD per output-row pair).  Processes 4 output rows per
 * iteration and stores them with ST12x4_UB.  Assumes height is a
 * multiple of 4.
 */
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= (1 * src_stride);

    /* 128 << 6 bias; note this path XORs the raw rows (not the
       interleaved _l vectors), the sign flip happens on src0..src5 */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2: right-interleaves for columns 0..7, merged
       left-interleaves (src2110) for columns 8..11 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* in0..in3: second prediction, columns 0..7;
           in4/in5: columns 8..11, two rows packed per vector */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);

        /* src2 becomes next iteration's history row */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
3125 | |||
/* Vertical 4-tap bi-prediction filter, 16-pixel-wide blocks.
 *
 * Each 16-wide row is split into low (..._r, columns 0..7) and high
 * (..._l, columns 8..15) interleaved halves, each filtered with its
 * own DPADD.  Processes 4 output rows per iteration as two manually
 * unrolled 2-row halves; row history (src4/src2 and the interleaved
 * pairs) is carried between the halves and across iterations.
 * Assumes height is a multiple of 4.
 */
static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* start one row above for the vertical 4-tap footprint */
    src0_ptr -= src_stride;

    /* 128 << 6 bias cancelling the u8->s8 sign flip below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* setup rows 0..2, split into low/high interleaves */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* ---- first two output rows ---- */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* in0/in1: second prediction cols 0..7, in2/in3: cols 8..15 */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* add second prediction, round by 7, clip to [0,255] */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* ---- next two output rows; src2 carries the history and
           src10_*/src21_* are recycled for the next iteration ---- */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
||
3210 | |||
/* Vertical 4-tap bi-predictive interpolation for 24-pixel-wide blocks.
 *
 * Filters the 8-bit reference in src0_ptr vertically with the 4-tap
 * filter, adds the 16-bit intermediate prediction from src1_ptr
 * (HEVC_BI_RND_CLIP*), rounds by 7 bits, clips to [0, 255] and stores
 * the packed 8-bit result to dst.  The 24-wide block is processed as a
 * 16-wide column (right+left halves of the interleaves) plus an 8-wide
 * column (right halves only).  Processes 4 output rows per loop
 * iteration, so height is assumed to be a multiple of 4.
 *
 * src0_ptr/src_stride   : 8-bit reference pixels and their stride
 * src1_ptr/src2_stride  : 16-bit intermediate prediction and its stride
 * dst/dst_stride        : 8-bit destination and its stride
 * filter                : 4-tap vertical filter coefficients
 * height                : number of output rows
 */
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* back up one row so the 4-tap window is centred on the output row */
    src0_ptr -= src_stride;

    /* bias that cancels the -128 XOR applied to the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: first 3 source rows for the 16-wide column */
    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* prologue for the extra 8-wide column (pixels 16..23) */
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first pair of output rows */
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width: two rows, low and high byte lanes */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        /* add intermediate prediction, round by 7, clip to 0..255 */
        /* 16width */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* second pair of output rows; note src2/src8 are reused as the
         * newest rows so the prologue state is rebuilt for the next
         * iteration without extra copies */
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
||
3334 | |||
/* Vertical 4-tap bi-predictive interpolation for 32-pixel-wide blocks.
 *
 * Same scheme as the 24-wide variant: vertical 4-tap filter on the
 * 8-bit reference, plus the 16-bit intermediate prediction, rounded by
 * 7 bits and clipped to [0, 255].  The 32-wide block is handled as two
 * independent 16-wide columns; the second column is written through
 * dst_tmp (= dst + 16).  Two output rows per loop iteration, so height
 * is assumed to be even.
 */
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;          /* destination of the second 16-wide column */
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* back up one row for the 4-tap window */
    src0_ptr -= src_stride;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: first 3 rows of the left 16-wide column */
    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* prologue: first 3 rows of the right 16-wide column */
    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* left 16-wide column: two rows */
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* add prediction, round by 7, clip */
        /* 16width */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        /* slide the 4-tap window down by two rows */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* right 16-wide column: two rows */
        /* next 16width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
        /* next 16width */
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
                          dst2_r, dst3_r, dst2_l, dst3_l);

        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the window for the right column */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
||
3444 | |||
/* Horizontal+vertical (2-D) 4-tap bi-predictive interpolation for a
 * 4x2 block.
 *
 * Each source row is filtered horizontally first (byte shuffles via
 * mask0/mask1 feeding DPADD); the 16-bit horizontal results are then
 * filtered vertically with 32-bit accumulation (HEVC_FILT_4TAP) and
 * shifted down by 6.  The 16-bit intermediate prediction from src1_ptr
 * is added, the sum is rounded by 7 bits and clipped to [0, 255], and
 * two 4-pixel rows are stored.  The height parameter is unused here:
 * this kernel always emits exactly 2 rows.
 */
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v8i16 in0, in1;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    /* rewind one row and one column to the first filter tap */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the 8-bit y taps to 16 bits:
     * clti_s_b yields an all-ones byte where the tap is negative,
     * which interleaved above the tap byte forms its sign extension */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal filtering of the first 3 rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* pack both 4-wide prediction rows into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src3, src4);
    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;
    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;
    /* add prediction, round by 7, clip to 0..255 */
    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);

    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
    ST4x2_UB(dst0_r, dst, dst_stride);
}
||
3524 | |||
/* Horizontal+vertical (2-D) 4-tap bi-predictive interpolation for a
 * 4x4 block.
 *
 * Same scheme as the 4x2 kernel: horizontal 4-tap filtering per row
 * (VSHF + DPADD), vertical 4-tap filtering on the 16-bit horizontal
 * results with a >>6 shift, then addition of the 16-bit intermediate
 * prediction, 7-bit rounding and clipping to [0, 255].  Emits exactly
 * 4 rows; the height parameter is unused.
 */
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v8i16 in0, in1, in2, in3;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 dst0_r, dst1_r;
    v4i32 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    /* rewind one row and one column to the first filter tap */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the 8-bit y taps to 16 bits (sign mask + interleave) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal filtering of the first 3 rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    /* pair the 4-wide prediction rows: (in0,in1) and (in2,in3) */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    XORI_B4_128_SB(src3, src4, src5, src6);
    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    tmp0 >>= 6;
    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    tmp1 >>= 6;
    /* row 5 (dst10_r is reused as the 5|4 interleave) */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    dst10_r = __msa_ilvr_h(dst5, dst4);
    tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    tmp2 >>= 6;
    /* row 6 (dst2/dst21_r reused likewise) */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    dst21_r = __msa_ilvr_h(dst2, dst5);
    tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    tmp3 >>= 6;
    /* add prediction, round by 7, clip to 0..255 */
    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);

    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
}
||
3618 | |||
/* Horizontal+vertical (2-D) 4-tap bi-predictive interpolation for
 * 4-pixel-wide blocks whose height is a multiple of 8.
 *
 * After a 3-row horizontal prologue, each loop iteration horizontally
 * filters 8 new rows, vertically combines them 4-tap at a time
 * (HEVC_FILT_4TAP, >>6), adds the 16-bit intermediate prediction,
 * rounds by 7 bits, clips to [0, 255] and stores 8 output rows.
 */
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;

    /* rewind one row and one column to the first filter tap */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the 8-bit y taps to 16 bits (sign mask + interleave) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal filtering of the 3 prologue rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        /* pair the 4-wide prediction rows two-by-two */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* row 3 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_r >>= 6;
        /* row 4 */
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_r >>= 6;
        /* row 5 */
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_r >>= 6;
        /* row 6 */
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_r >>= 6;
        /* row 7 */
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst4_r >>= 6;
        /* row 8 */
        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst8 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst5_r >>= 6;
        /* row 9 (dst10_r reused for the 9|8 interleave) */
        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst9 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        dst6_r >>= 6;
        /* row 10 (dst2/dst21_r reused to re-seed the next iteration) */
        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        dst7_r >>= 6;
        /* add prediction, round by 7, clip to 0..255 */
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        ST4x8_UB(tmp0, tmp1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
||
3750 | |||
3751 | static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, |
||
3752 | int32_t src_stride, |
||
3753 | int16_t *src1_ptr, |
||
3754 | int32_t src2_stride, |
||
3755 | uint8_t *dst, |
||
3756 | int32_t dst_stride, |
||
3757 | const int8_t *filter_x, |
||
3758 | const int8_t *filter_y, |
||
3759 | int32_t height) |
||
3760 | { |
||
3761 | if (2 == height) { |
||
3762 | hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
3763 | dst, dst_stride, filter_x, filter_y, height); |
||
3764 | } else if (4 == height) { |
||
3765 | hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
3766 | dst, dst_stride, filter_x, filter_y, height); |
||
3767 | } else if (0 == (height % 8)) { |
||
3768 | hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride, |
||
3769 | src1_ptr, src2_stride, |
||
3770 | dst, dst_stride, |
||
3771 | filter_x, filter_y, height); |
||
3772 | } |
||
3773 | } |
||
3774 | |||
/* Horizontal+vertical (2-D) 4-tap bi-predictive interpolation for
 * 6-pixel-wide blocks.
 *
 * Uses full 8-wide horizontal filtering, so both the right (ILVR) and
 * left (ILVL) halves of each 16-bit interleave feed the vertical
 * filter; only 6 pixels per row are finally stored via ST6x4_UB.
 * Four output rows per loop iteration, so height is assumed to be a
 * multiple of 4.
 */
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    /* rewind one row and one column to the first filter tap */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the 8-bit y taps to 16 bits (sign mask + interleave) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal filtering of the 3 prologue rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        /* row 3 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst0_r >>= 6;
        dst0_l >>= 6;

        /* row 4 */
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst1_r >>= 6;
        dst1_l >>= 6;

        /* row 5 (dst10_r/dst10_l reused for the 5|4 interleave) */
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        dst2_r >>= 6;
        dst2_l >>= 6;

        /* row 6 (dst2/dst21_* reused to re-seed the next iteration) */
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        dst3_r >>= 6;
        dst3_l >>= 6;
        /* add prediction, round by 7, clip to 0..255 */
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
||
3889 | |||
/* Horizontal+vertical (2-D) 4-tap bi-predictive interpolation for an
 * 8x2 block.
 *
 * Horizontal 4-tap filtering per row (VSHF + DPADD) followed by
 * vertical 4-tap filtering on both halves (ILVRL) of the 16-bit
 * horizontal results with a >>6 shift, then addition of the 16-bit
 * intermediate prediction, 7-bit rounding and clipping to [0, 255].
 * Emits exactly 2 rows; the height parameter is unused.
 */
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 in0, in1;

    /* rewind one row and one column to the first filter tap */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the 8-bit y taps to 16 bits (sign mask + interleave) */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* bias cancelling the -128 XOR on the unsigned source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal filtering of the 3 prologue rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    /* add prediction, round by 7, clip to 0..255 */
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);

    dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST8x2_UB(dst0_r, dst, dst_stride);
}
||
3977 | |||
/* Bi-predictive HEVC 4-tap horizontal+vertical filter, 8x6 block.
 * Fully unrolled: 3 priming rows plus 6 output rows (9 source rows total).
 * Parameters as in hevc_hv_bi_4t_8x2_msa; height is unused (always 6).
 */
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    /* centre the 4-tap window: one row up, one column left */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16 bits before splatting 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 8192 bias matching the XORI_B*_128 sign conversion below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* prime the vertical window with 3 horizontally-filtered rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    /* 6 rows of first-prediction samples */
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    /* output row 0 */
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;
    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);

    /* output row 1 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;
    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);

    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    /* row 5 */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;
    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);

    /* row 6 */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;
    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);

    LD_SB2(src0_ptr, src_stride, src7, src8);
    XORI_B2_128_SB(src7, src8);
    /* row 7 */
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);

    dst4_r >>= 6;
    dst4_l >>= 6;
    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
    /* row 8 */
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;
    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);

    /* add first prediction, round by 7, clip to [0, 255] */
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);

    /* pack to bytes and store 4 + 2 rows */
    PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
    dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2_r, dst, dst_stride);
}

/* Bi-predictive HEVC 4-tap horizontal+vertical filter for blocks whose
 * width is a multiple of 8 and whose height is a multiple of 4.
 *
 * Outer loop walks 8-pixel-wide columns (width >> 3); the inner loop
 * produces 4 output rows per iteration (height >> 2), keeping a sliding
 * 3-row window of horizontally-filtered data live across iterations.
 * Other parameters as in hevc_hv_bi_4t_8x2_msa.
 */
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    /* centre the 4-tap window: one row up, one column left */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bits, splat 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 8192 bias matching the XORI_B*_128 sign conversion below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* prime the vertical window with 3 horizontally-filtered rows */
        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;

            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

            /* dst10_r/l are reused here to hold the row4/row5 pairs that
               the NEXT loop iteration reads as its oldest window entries */
            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            /* dst2 is overwritten with the newest (row 6) horizontal
               result so the window priming stays valid across iterations */
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

            /* likewise dst21_r/l are reused for the row5/row6 pairs */
            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;

            /* add first prediction, round by 7, clip, pack, store 4 rows */
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              tmp0, tmp1, tmp2, tmp3, 7,
                              tmp0, tmp1, tmp2, tmp3);

            PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        /* advance to the next 8-pixel column */
        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

4258 | static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, |
||
4259 | int32_t src_stride, |
||
4260 | int16_t *src1_ptr, |
||
4261 | int32_t src2_stride, |
||
4262 | uint8_t *dst, |
||
4263 | int32_t dst_stride, |
||
4264 | const int8_t *filter_x, |
||
4265 | const int8_t *filter_y, |
||
4266 | int32_t height) |
||
4267 | { |
||
4268 | if (2 == height) { |
||
4269 | hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
4270 | dst, dst_stride, filter_x, filter_y, height); |
||
4271 | } else if (6 == height) { |
||
4272 | hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
4273 | dst, dst_stride, filter_x, filter_y, height); |
||
4274 | } else { |
||
4275 | hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, |
||
4276 | src1_ptr, src2_stride, |
||
4277 | dst, dst_stride, |
||
4278 | filter_x, filter_y, height, 8); |
||
4279 | } |
||
4280 | } |
||
4281 | |||
/* Bi-predictive HEVC 4-tap hv filter, 12-wide blocks: processed as one
 * 8-wide column via the generic multiple-of-8 routine plus one 4-wide
 * column (columns 8..11) via the 4-wide kernel.
 */
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
    hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter_x, filter_y, height);
}

/* Bi-predictive HEVC 4-tap hv filter, 16-wide blocks (two 8-wide columns). */
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}

/* Bi-predictive HEVC 4-tap hv filter, 24-wide blocks (three 8-wide columns). */
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}

4329 | static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, |
||
4330 | int32_t src_stride, |
||
4331 | int16_t *src1_ptr, |
||
4332 | int32_t src2_stride, |
||
4333 | uint8_t *dst, |
||
4334 | int32_t dst_stride, |
||
4335 | const int8_t *filter_x, |
||
4336 | const const int8_t *filter_y, |
||
4337 | int32_t height) |
||
4338 | { |
||
4339 | hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, |
||
4340 | dst, dst_stride, filter_x, filter_y, |
||
4341 | height, 32); |
||
4342 | } |
||
4343 | |||
/* Instantiates the exported bi-predictive "copy" (full-pel, no filtering)
 * entry point ff_hevc_put_hevc_bi_pel_pixels<WIDTH>_8_msa(), which forwards
 * to the matching hevc_bi_copy_<WIDTH>w_msa() kernel with the intermediate
 * buffer stride fixed at MAX_PB_SIZE.  mx/my/width are part of the common
 * FFmpeg mc function signature and are unused here.
 */
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}

/* Exported bi-predictive copy functions for every HEVC block width. */
BI_MC_COPY(4);
BI_MC_COPY(6);
BI_MC_COPY(8);
BI_MC_COPY(12);
BI_MC_COPY(16);
BI_MC_COPY(24);
BI_MC_COPY(32);
BI_MC_COPY(48);
BI_MC_COPY(64);

#undef BI_MC_COPY

/* Instantiates a single-direction (horizontal or vertical) bi-predictive
 * MC entry point ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa().
 *   PEL      : qpel (8-tap luma) or epel (4-tap chroma) — selects the
 *              ff_hevc_<PEL>_filters table
 *   DIR/DIR1 : public name suffix (h/v) / internal name part (hz/vt)
 *   TAP      : filter tap count used in the internal kernel name
 *   FILT_DIR : mx or my — whichever fractional offset selects the filter
 *              (table is indexed with FILT_DIR - 1)
 */
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
                                             MAX_PB_SIZE, dst, dst_stride,   \
                                             filter, height);                \
}

/* 8-tap (qpel) horizontal bi-prediction, widths 4..64. */
BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

/* 8-tap (qpel) vertical bi-prediction, widths 4..64. */
BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

/* 4-tap (epel) horizontal bi-prediction, widths 4..32. */
BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

/* 4-tap (epel) vertical bi-prediction, widths 4..32. */
BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);

#undef BI_MC

/* Instantiates a combined horizontal+vertical (2-D) bi-predictive MC entry
 * point ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa().  Both filter tables
 * are selected here: mx picks the horizontal taps, my the vertical taps
 * (each table is indexed with the fractional offset minus 1).
 */
#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                 \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
                                                                             \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
                                             MAX_PB_SIZE, dst, dst_stride,   \
                                             filter_x, filter_y,             \
                                             height);                        \
}

4444 | |||
4445 | BI_MC_HV(qpel, hv, 4, 8, hv); |
||
4446 | BI_MC_HV(qpel, hv, 8, 8, hv); |
||
4447 | BI_MC_HV(qpel, hv, 12, 8, hv); |
||
4448 | BI_MC_HV(qpel, hv, 16, 8, hv); |
||
4449 | BI_MC_HV(qpel, hv, 24, 8, hv); |
||
4450 | BI_MC_HV(qpel, hv, 32, 8, hv); |
||
4451 | BI_MC_HV(qpel, hv, 48, 8, hv); |
||
4452 | BI_MC_HV(qpel, hv, 64, 8, hv); |
||
4453 | |||
4454 | BI_MC_HV(epel, hv, 4, 4, hv); |
||
4455 | BI_MC_HV(epel, hv, 8, 4, hv); |
||
4456 | BI_MC_HV(epel, hv, 6, 4, hv); |
||
4457 | BI_MC_HV(epel, hv, 12, 4, hv); |
||
4458 | BI_MC_HV(epel, hv, 16, 4, hv); |
||
4459 | BI_MC_HV(epel, hv, 24, 4, hv); |
||
4460 | BI_MC_HV(epel, hv, 32, 4, hv); |
||
4461 | |||
4462 | #undef BI_MC_HV=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=>=><=> |