/*
 * Copyright (c) 2014 RISC OS Open Ltd
 * Author: Ben Avison
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

// Layout constants. They are used below to form byte offsets into the
// coefficient and state arrays and the per-sample stride of the sample
// buffers (all offsets are multiplied by 4, i.e. 32-bit elements).
#define MAX_CHANNELS    8
#define MAX_FIR_ORDER   8
#define MAX_IIR_ORDER   4
#define MAX_RATEFACTOR  4
#define MAX_BLOCKSIZE   (40 * MAX_RATEFACTOR)

// Register roles for ff_mlp_filter_channel_arm and its helper macros.
// a1-a4 carry the first four arguments on entry; a3/a4 (firorder/iirorder)
// are consumed by the jump-table dispatch and then reused as accumulators.
PST     .req    a1      // state pointer (pre-decremented once per sample)
PCO     .req    a2      // coefficient pointer
AC0     .req    a3      // accumulator, low word  (firorder on entry)
AC1     .req    a4      // accumulator, high word (iirorder on entry)
CO0     .req    v1      // coefficient bank CO0-CO3
CO1     .req    v2
CO2     .req    v3
CO3     .req    v4
ST0     .req    v5      // state bank ST0-ST3 (ST0/ST1 hold shift/mask on entry)
ST1     .req    v6
ST2     .req    sl
ST3     .req    fp
I       .req    ip      // remaining sample count
PSAMP   .req    lr      // sample buffer pointer


// Emit one position-independent jump-table entry per argument.
// Each argument is a label difference relative to the table start.
//   A lines: a 32-bit word holding (offset - 4), matching the
//            "ldr Rx, [pc, idx, lsl #2]" / "add pc, pc, Rx" dispatch used below
//   T lines: a 16-bit halfword holding (offset / 2), matching "tbh"
// The A/T line prefixes are provided by asm.S (presumably they select
// ARM-only / Thumb-only lines - confirm against that header).
// The macro recurses until the argument list is exhausted.
.macro branch_pic_label first, remainder:vararg
A       .word           \first - 4
T       .hword          (\first) / 2
.ifnb   \remainder
        branch_pic_label \remainder
.endif
.endm

// Some macros that do loads/multiplies where the register number is determined
// from an assembly-time expression. Boy is GNU assembler's syntax ugly...

// load group, index, base, offset
//   Emits "ldr <group><index>, [base, #offset]" where index is an
//   assembly-time expression. The %(...) operator, available only in
//   .altmacro mode, evaluates the expression to a number before it is pasted
//   onto the group prefix by load_.
.macro load  group, index, base, offset
       .altmacro
       load_ \group, %(\index), \base, \offset
       .noaltmacro
.endm

// Worker for "load": index is already a literal number here, so
// group and index concatenate into a register alias name such as CO2 or ST1.
.macro load_  group, index, base, offset
        ldr     \group\index, [\base, #\offset]
.endm

// loadd group, index, base, offset
//   Loads two consecutive words into <group><index> and <group><index+1>.
//   Both register numbers are evaluated with %(...) under .altmacro and
//   passed on to loadd_.
.macro loadd  group, index, base, offset
       .altmacro
       loadd_ \group, %(\index), %(\index+1), \base, \offset
       .noaltmacro
.endm

// Worker for "loadd". On A-prefixed lines, offsets of 256 or more are split
// into two single-word loads; otherwise one ldrd is emitted. The conditional
// directives are themselves A-prefixed, so where A lines are disabled only the
// ldrd remains.
// NOTE(review): the 256 threshold presumably reflects the 8-bit immediate of
// ARM-mode ldrd - confirm against the architecture manual.
.macro loadd_  group, index0, index1, base, offset
A .if \offset >= 256
A       ldr     \group\index0, [\base, #\offset]
A       ldr     \group\index1, [\base, #(\offset) + 4]
A .else
        ldrd    \group\index0, \group\index1, [\base, #\offset]
A .endif
.endm

// multiply index, accumulate, long
//   Multiplies CO<index> by ST<index>; index is an assembly-time expression
//   that is evaluated with %(...) before being handed to multiply_.
.macro multiply  index, accumulate, long
        .altmacro
        multiply_ %(\index), \accumulate, \long
        .noaltmacro
.endm

// Worker for "multiply"; index is a literal number here.
//   long != 0        : 64-bit product in AC1:AC0 (smull, or smlal to accumulate)
//   long == 0        : 32-bit product in AC0     (mul,   or mla   to accumulate)
//   accumulate != 0  : add the product to the running total instead of
//                      overwriting it
.macro multiply_  index, accumulate, long
 .if \long
  .if \accumulate
        smlal   AC0, AC1, CO\index, ST\index
  .else
        smull   AC0, AC1, CO\index, ST\index
  .endif
 .else
  .if \accumulate
        mla     AC0, CO\index, ST\index, AC0
  .else
        mul     AC0, CO\index, ST\index
  .endif
 .endif
.endm

// A macro to update the load register number and load offsets

// inc howmany
//   Advances the assembly-time bookkeeping after "howmany" taps were loaded:
//     LOAD_REG             next register number in the 4-entry bank (wraps)
//     OFFSET_CO/OFFSET_ST  byte offsets of the next coefficient / state word
//     FIR_REMAIN           FIR taps still to load; when it reaches zero the
//                          offsets jump to the start of the IIR coefficients
//                          and IIR state
//     IIR_REMAIN           IIR taps still to load (only counted down once the
//                          FIR taps are exhausted)
//   Emits no instructions.
.macro inc  howmany
  .set LOAD_REG, (LOAD_REG + \howmany) & 3
  .set OFFSET_CO, OFFSET_CO + 4 * \howmany
  .set OFFSET_ST, OFFSET_ST + 4 * \howmany
  .if FIR_REMAIN > 0
    .set FIR_REMAIN, FIR_REMAIN - \howmany
    .if FIR_REMAIN == 0
      .set OFFSET_CO, 4 * MAX_FIR_ORDER
      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
    .endif
  .elseif IIR_REMAIN > 0
    .set IIR_REMAIN, IIR_REMAIN - \howmany
  .endif
.endm

// Macro to implement the inner loop for one specific combination of parameters
//
// Parameters (all assembly-time constants):
//   mask_minus1  non-zero if the mask is known to be -1 (the AND is omitted)
//   shift_0      non-zero if the filter shift is 0 (32-bit mul/mla are used)
//   shift_8      non-zero if the filter shift is 8 (fixed-shift extraction)
//   iir_taps     number of IIR taps
//   fir_taps     number of FIR taps
//
// Expected on entry: PST/PCO point at state/coefficients, ST0 holds the shift,
// ST1 holds the mask, I holds the sample count and PSAMP the sample pointer.
// SHIFT and MASK are temporary register aliases chosen per variant:
//   SHUFFLE_x  the value is moved once, before the loop, into a free register
//   SPILL_x    no register is free, so the value is reloaded from the
//              stacked arguments on every iteration (alias of ST0/ST1)
// The expansion ends with an unconditional branch to label 99, which the
// enclosing function must provide.

.macro implement_filter  mask_minus1, shift_0, shift_8, iir_taps, fir_taps
  .set TOTAL_TAPS, \iir_taps + \fir_taps

  // Deal with register allocation...
  .set DEFINED_SHIFT, 0
  .set DEFINED_MASK, 0
  .set SHUFFLE_SHIFT, 0
  .set SHUFFLE_MASK, 0
  .set SPILL_SHIFT, 0
  .set SPILL_MASK, 0
  .if TOTAL_TAPS == 0
    // Little register pressure in this case - just keep MASK where it was
    .if !\mask_minus1
      MASK .req ST1
      .set DEFINED_MASK, 1
    .endif
  .else
    .if \shift_0
      .if !\mask_minus1
        // AC1 is unused with shift 0
        MASK .req AC1
        .set DEFINED_MASK, 1
        .set SHUFFLE_MASK, 1
      .endif
    .elseif \shift_8
      .if !\mask_minus1
        .if TOTAL_TAPS <= 4
          // All coefficients are preloaded (so pointer not needed)
          MASK .req PCO
          .set DEFINED_MASK, 1
          .set SHUFFLE_MASK, 1
        .else
          .set SPILL_MASK, 1
        .endif
      .endif
    .else // shift not 0 or 8
      .if TOTAL_TAPS <= 3
        // All coefficients are preloaded, and at least one CO register is unused
        .if \fir_taps & 1
          SHIFT .req CO0
          .set DEFINED_SHIFT, 1
          .set SHUFFLE_SHIFT, 1
        .else
          SHIFT .req CO3
          .set DEFINED_SHIFT, 1
          .set SHUFFLE_SHIFT, 1
        .endif
        .if !\mask_minus1
          MASK .req PCO
          .set DEFINED_MASK, 1
          .set SHUFFLE_MASK, 1
        .endif
      .elseif TOTAL_TAPS == 4
        // All coefficients are preloaded
        SHIFT .req PCO
        .set DEFINED_SHIFT, 1
        .set SHUFFLE_SHIFT, 1
        .if !\mask_minus1
          .set SPILL_MASK, 1
        .endif
      .else
        .set SPILL_SHIFT, 1
        .if !\mask_minus1
          .set SPILL_MASK, 1
        .endif
      .endif
    .endif
  .endif
  .if SPILL_SHIFT
    SHIFT .req ST0
    .set DEFINED_SHIFT, 1
  .endif
  .if SPILL_MASK
    MASK .req ST1
    .set DEFINED_MASK, 1
  .endif

  // Preload coefficients if possible
  // (FIR coefficients start at offset 0, IIR coefficients at 4 * MAX_FIR_ORDER;
  // the starting register is 1 for an odd FIR tap count, otherwise 0)
  .if TOTAL_TAPS <= 4
    .set OFFSET_CO, 0
    .if \fir_taps & 1
      .set LOAD_REG, 1
    .else
      .set LOAD_REG, 0
    .endif
    .rept \fir_taps
        load    CO, LOAD_REG, PCO, OFFSET_CO
      .set LOAD_REG, (LOAD_REG + 1) & 3
      .set OFFSET_CO, OFFSET_CO + 4
    .endr
    .set OFFSET_CO, 4 * MAX_FIR_ORDER
    .rept \iir_taps
        load    CO, LOAD_REG, PCO, OFFSET_CO
      .set LOAD_REG, (LOAD_REG + 1) & 3
      .set OFFSET_CO, OFFSET_CO + 4
    .endr
  .endif

  // Move mask/shift to final positions if necessary
  // Need to do this after preloading, because in some cases we
  // reuse the coefficient pointer register
  .if SHUFFLE_SHIFT
        mov     SHIFT, ST0
  .endif
  .if SHUFFLE_MASK
        mov     MASK, ST1
  .endif

  // Begin loop
01:
  .if TOTAL_TAPS == 0
    // Things simplify a lot in this case
    // In fact this could be pipelined further if it's worth it...
        ldr     ST0, [PSAMP]
        subs    I, I, #1
    .if !\mask_minus1
        and     ST0, ST0, MASK
    .endif
        str     ST0, [PST, #-4]!
        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST0, [PSAMP], #4 * MAX_CHANNELS
        bne     01b
  .else
    .if \fir_taps & 1
      .set LOAD_REG, 1
    .else
      .set LOAD_REG, 0
    .endif
    .set LOAD_BANK, 0
    .set FIR_REMAIN, \fir_taps
    .set IIR_REMAIN, \iir_taps
    .if FIR_REMAIN == 0 // only IIR terms
      .set OFFSET_CO, 4 * MAX_FIR_ORDER
      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
    .else
      .set OFFSET_CO, 0
      .set OFFSET_ST, 0
    .endif
    .set MUL_REG, LOAD_REG
    .set COUNTER, 0
    // Unrolled at assembly time: each step emits at most one load group and,
    // from the third step onwards, one multiply that trails the loads.
    .rept TOTAL_TAPS + 2
      // Do load(s)
      .if FIR_REMAIN != 0 || IIR_REMAIN != 0
        .if COUNTER == 0
          .if TOTAL_TAPS > 4
        load    CO, LOAD_REG, PCO, OFFSET_CO
          .endif
        load    ST, LOAD_REG, PST, OFFSET_ST
        inc     1
        .elseif COUNTER == 1 && (\fir_taps & 1) == 0
          .if TOTAL_TAPS > 4
        load    CO, LOAD_REG, PCO, OFFSET_CO
          .endif
        load    ST, LOAD_REG, PST, OFFSET_ST
        inc     1
        .elseif LOAD_BANK == 0
          // Coefficient half of a paired load; the state half follows on the
          // next step (LOAD_BANK == 1), which also advances the bookkeeping.
          .if TOTAL_TAPS > 4
            .if FIR_REMAIN == 0 && IIR_REMAIN == 1
        load    CO, LOAD_REG, PCO, OFFSET_CO
            .else
        loadd   CO, LOAD_REG, PCO, OFFSET_CO
            .endif
          .endif
          .set LOAD_BANK, 1
        .else
          .if FIR_REMAIN == 0 && IIR_REMAIN == 1
        load    ST, LOAD_REG, PST, OFFSET_ST
        inc     1
          .else
        loadd   ST, LOAD_REG, PST, OFFSET_ST
        inc     2
          .endif
          .set LOAD_BANK, 0
        .endif
      .endif

      // Do interleaved multiplies, slightly delayed
      .if COUNTER >= 2
        multiply MUL_REG, COUNTER > 2, !\shift_0
        .set MUL_REG, (MUL_REG + 1) & 3
      .endif
      .set COUNTER, COUNTER + 1
    .endr

    // Post-process the result of the multiplies
    // (spilled shift/mask are re-read from the stacked arguments, which sit
    // above the nine registers pushed by the enclosing function)
    .if SPILL_SHIFT
        ldr     SHIFT, [sp, #9*4 + 0*4]
    .endif
    .if SPILL_MASK
        ldr     MASK, [sp, #9*4 + 1*4]
    .endif
        ldr     ST2, [PSAMP]
        subs    I, I, #1
    // Extract the 32-bit window of the 64-bit accumulator at the shift position
    .if \shift_8
        mov     AC0, AC0, lsr #8
        orr     AC0, AC0, AC1, lsl #24
    .elseif !\shift_0
        rsb     ST3, SHIFT, #32
        mov     AC0, AC0, lsr SHIFT
A       orr     AC0, AC0, AC1, lsl ST3
T       mov     AC1, AC1, lsl ST3
T       orr     AC0, AC0, AC1
    .endif
    // ST3 = new (masked) sample, ST2 = value stored at the IIR-state offset
    .if \mask_minus1
        add     ST3, ST2, AC0
    .else
        add     ST2, ST2, AC0
        and     ST3, ST2, MASK
        sub     ST2, ST3, AC0
    .endif
        str     ST3, [PST, #-4]!
        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
        str     ST3, [PSAMP], #4 * MAX_CHANNELS
        bne     01b
  .endif
        b       99f

  .if DEFINED_SHIFT
    .unreq SHIFT
  .endif
  .if DEFINED_MASK
    .unreq MASK
  .endif
.endm

// switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
//   Dispatches on the run-time FIR order held in a3 through a
//   position-independent jump table, then expands implement_filter once for
//   every FIR tap count from 0 up to 8 - iir_taps (the nested .if blocks trim
//   both the table and the expansions to that range).
//   Clobbers CO0 on A-prefixed lines (used to hold the table entry).
.macro switch_on_fir_taps  mask_minus1, shift_0, shift_8, iir_taps
A       ldr     CO0, [pc, a3, lsl #2]   // firorder is in range 0-(8-iir_taps)
A       add     pc, pc, CO0
T       tbh     [pc, a3, lsl #1]
0:
        branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
        branch_pic_label (74f - 0b)
 .if \iir_taps <= 3
        branch_pic_label (75f - 0b)
  .if \iir_taps <= 2
        branch_pic_label (76f - 0b)
   .if \iir_taps <= 1
        branch_pic_label (77f - 0b)
    .if \iir_taps == 0
        branch_pic_label (78f - 0b)
    .endif
   .endif
  .endif
 .endif
70:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
71:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
72:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
73:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
74:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
 .if \iir_taps <= 3
75:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
  .if \iir_taps <= 2
76:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
   .if \iir_taps <= 1
77:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
    .if \iir_taps == 0
78:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
    .endif
   .endif
  .endif
 .endif
.endm

// switch_on_iir_taps mask_minus1, shift_0, shift_8
//   Dispatches on the run-time IIR order held in a4 through a
//   position-independent jump table and expands switch_on_fir_taps for each
//   of the five possible IIR tap counts.
//   Clobbers CO0 on A-prefixed lines (used to hold the table entry).
.macro switch_on_iir_taps  mask_minus1, shift_0, shift_8
A       ldr     CO0, [pc, a4, lsl #2]   // iirorder is in range 0-4
A       add     pc, pc, CO0
T       tbh     [pc, a4, lsl #1]
0:
        branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
        branch_pic_label (64f - 0b)
60:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 0
61:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 1
62:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 2
63:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 3
64:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 4
.endm

/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
 *                                int firorder, int iirorder,
 *                                unsigned int filter_shift, int32_t mask,
 *                                int blocksize, int32_t *sample_buffer);
 *
 * In:    a1 = state, a2 = coeff, a3 = firorder, a4 = iirorder;
 *        filter_shift, mask, blocksize and sample_buffer are read from the
 *        stack into ST0, ST1, I and PSAMP.
 * Saves: v1-fp and lr (nine words), restored at label 99.
 *
 * The body is a tree of specialised loops: first split on whether mask is -1,
 * then on whether the shift is 0, 8 or something else, then (inside the
 * switch macros) on the IIR and FIR orders. Every specialised loop finishes
 * by branching to label 99.
 */
function ff_mlp_filter_channel_arm, export=1
        push    {v1-fp,lr}
        add     v1, sp, #9*4            // point at arguments on stack
        ldm     v1, {ST0,ST1,I,PSAMP}
        cmp     ST1, #-1
        bne     30f
        // lsl #29 keeps bits 0-2 of the shift (Z set if they are all zero)
        // and moves bit 3 into the carry flag (set for 8, clear for 0)
        movs    ST2, ST0, lsl #29       // shift is in range 0-15; we want to special-case 0 and 8
        bne     20f
        bcs     10f
        switch_on_iir_taps 1, 1, 0
10:     switch_on_iir_taps 1, 0, 1
20:     switch_on_iir_taps 1, 0, 0
30:     movs    ST2, ST0, lsl #29       // shift is in range 0-15; we want to special-case 0 and 8
        bne     50f
        bcs     40f
        switch_on_iir_taps 0, 1, 0
40:     switch_on_iir_taps 0, 0, 1
50:     switch_on_iir_taps 0, 0, 0
99:     pop     {v1-fp,pc}
endfunc

426 | .unreq PST |
||
427 | .unreq PCO |
||
428 | .unreq AC0 |
||
429 | .unreq AC1 |
||
430 | .unreq CO0 |
||
431 | .unreq CO1 |
||
432 | .unreq CO2 |
||
433 | .unreq CO3 |
||
434 | .unreq ST0 |
||
435 | .unreq ST1 |
||
436 | .unreq ST2 |
||
437 | .unreq ST3 |
||
438 | .unreq I |
||
439 | .unreq PSAMP |
||
440 | |||
441 | /********************************************************************/ |
||
442 | |||
// Register roles for ff_mlp_rematrix_channel_arm and its helper macros.
PSA     .req    a1      // samples
PCO     .req    a2      // coeffs
PBL     .req    a3      // bypassed_lsbs
INDEX   .req    a4      // packed loop state, see layout below
CO0     .req    v1      // coefficient bank CO0-CO3
CO1     .req    v2
CO2     .req    v3
CO3     .req    v4
SA0     .req    v5      // sample bank SA0-SA3
SA1     .req    v6
SA2     .req    sl
SA3     .req    fp
AC0     .req    ip      // accumulator, low word
AC1     .req    lr      // accumulator, high word
// The following names share registers with the sample bank; the loop reloads
// NOISE, DCH and MASK from the stack on every iteration.
NOISE   .req    SA0
LSB     .req    SA1
DCH     .req    SA2     // dest_ch
MASK    .req    SA3

// INDEX is used as follows:
// bits 0..6   index2 (values up to 17, but wider so that we can
//               add to index field without needing to mask)
// bits 7..14  i (values up to 160)
// bit 15      underflow detect for i
// bits 25..31 (if access_unit_size_pow2 == 128)  \ index
// bits 26..31 (if access_unit_size_pow2 == 64)   /

// implement_rematrix shift, index_mask, mask_minus1, maxchan
//   Emits the rematrix loop for one parameter combination.
//     shift        matrix noise shift; 0 selects the variant without noise
//     index_mask   63 or 127, selecting the width of the index field kept in
//                  the top bits of INDEX (not examined when shift is 0)
//     mask_minus1  non-zero if the mask is known to be -1 (the AND is omitted)
//     maxchan      1, 5 or 7; maxchan + 1 coefficient/sample products are
//                  accumulated per output sample
//   Expects NOISE, DCH and MASK to be the three words at the top of the stack.
//   Each iteration accumulates the products in AC1:AC0, optionally adds the
//   sign-extended noise byte shifted left by (shift + 7), extracts bits 14..45
//   of the sum, applies the mask, adds the bypassed LSB byte and stores the
//   result at PSA[DCH] (PSA having already been advanced by one sample row).
//   The loop ends when bit 15 of INDEX becomes set, then branches to label 98
//   in the enclosing function.
.macro implement_rematrix  shift, index_mask, mask_minus1, maxchan
    .if \maxchan == 1
        // We can just leave the coefficients in registers in this case
        ldrd    CO0, CO1, [PCO]
    .endif
1:
    .if \maxchan == 1
        ldrd    SA0, SA1, [PSA]
        smull   AC0, AC1, CO0, SA0
    .elseif \maxchan == 5
        ldr     CO0, [PCO, #0]
        ldr     SA0, [PSA, #0]
        ldr     CO1, [PCO, #4]
        ldr     SA1, [PSA, #4]
        ldrd    CO2, CO3, [PCO, #8]
        smull   AC0, AC1, CO0, SA0
        ldrd    SA2, SA3, [PSA, #8]
        smlal   AC0, AC1, CO1, SA1
        ldrd    CO0, CO1, [PCO, #16]
        smlal   AC0, AC1, CO2, SA2
        ldrd    SA0, SA1, [PSA, #16]
        smlal   AC0, AC1, CO3, SA3
        smlal   AC0, AC1, CO0, SA0
    .else // \maxchan == 7
        ldr     CO2, [PCO, #0]
        ldr     SA2, [PSA, #0]
        ldr     CO3, [PCO, #4]
        ldr     SA3, [PSA, #4]
        ldrd    CO0, CO1, [PCO, #8]
        smull   AC0, AC1, CO2, SA2
        ldrd    SA0, SA1, [PSA, #8]
        smlal   AC0, AC1, CO3, SA3
        ldrd    CO2, CO3, [PCO, #16]
        smlal   AC0, AC1, CO0, SA0
        ldrd    SA2, SA3, [PSA, #16]
        smlal   AC0, AC1, CO1, SA1
        ldrd    CO0, CO1, [PCO, #24]
        smlal   AC0, AC1, CO2, SA2
        ldrd    SA0, SA1, [PSA, #24]
        smlal   AC0, AC1, CO3, SA3
        smlal   AC0, AC1, CO0, SA0
    .endif
        // SA0, SA2 and SA3 are dead from here on, so their registers can be
        // reloaded as NOISE, DCH and MASK; SA1 is still needed by the final
        // multiply and only becomes LSB afterwards.
        ldm     sp, {NOISE, DCH, MASK}
        smlal   AC0, AC1, CO1, SA1
    .if \shift != 0
      .if \index_mask == 63
        add     NOISE, NOISE, INDEX, lsr #32-6
        ldrb    LSB, [PBL], #MAX_CHANNELS
        ldrsb   NOISE, [NOISE]
        add     INDEX, INDEX, INDEX, lsl #32-6
      .else // \index_mask == 127
        add     NOISE, NOISE, INDEX, lsr #32-7
        ldrb    LSB, [PBL], #MAX_CHANNELS
        ldrsb   NOISE, [NOISE]
        add     INDEX, INDEX, INDEX, lsl #32-7
      .endif
        sub     INDEX, INDEX, #1<<7
        // 64-bit add of the sign-extended, shifted noise value
        adds    AC0, AC0, NOISE, lsl #\shift + 7
        adc     AC1, AC1, NOISE, asr #31
    .else
        ldrb    LSB, [PBL], #MAX_CHANNELS
        sub     INDEX, INDEX, #1<<7
    .endif
        add     PSA, PSA, #MAX_CHANNELS*4
        mov     AC0, AC0, lsr #14
        orr     AC0, AC0, AC1, lsl #18
    .if !\mask_minus1
        and     AC0, AC0, MASK
    .endif
        add     AC0, AC0, LSB
        tst     INDEX, #1<<15
        str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
        beq     1b
        b       98f
.endm

// switch_on_maxchan shift, index_mask, mask_minus1
//   Selects the rematrix loop by the run-time value in v4 (maxchan):
//   below 5 uses the maxchan == 1 loop, exactly 5 the maxchan == 5 loop and
//   anything higher the maxchan == 7 loop.
.macro switch_on_maxchan  shift, index_mask, mask_minus1
        cmp     v4, #5
        blo     51f
        beq     50f
        implement_rematrix  \shift, \index_mask, \mask_minus1, 7
50:     implement_rematrix  \shift, \index_mask, \mask_minus1, 5
51:     implement_rematrix  \shift, \index_mask, \mask_minus1, 1
.endm

// switch_on_mask shift, index_mask
//   Selects between the variants with and without the final AND, depending
//   on whether the run-time value in sl (mask) equals -1.
.macro switch_on_mask  shift, index_mask
        cmp     sl, #-1
        bne     40f
        switch_on_maxchan  \shift, \index_mask, 1
40:     switch_on_maxchan  \shift, \index_mask, 0
.endm

// switch_on_au_size shift
//   For shift == 0 the index field is not needed, so the mask switch is
//   expanded once with a placeholder index_mask. Otherwise the run-time value
//   in v6 (access_unit_size_pow2) picks the 64 or 128 variant, and v1 (index)
//   is inserted into the top 6 or 7 bits of INDEX before entering the loop.
.macro switch_on_au_size  shift
    .if \shift == 0
        switch_on_mask  \shift, undefined
    .else
        teq     v6, #64
        bne     30f
        orr     INDEX, INDEX, v1, lsl #32-6
        switch_on_mask  \shift, 63
30:     orr     INDEX, INDEX, v1, lsl #32-7
        switch_on_mask  \shift, 127
    .endif
.endm

/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
 *                                  const int32_t *coeffs,
 *                                  const uint8_t *bypassed_lsbs,
 *                                  const int8_t *noise_buffer,
 *                                  int index,
 *                                  unsigned int dest_ch,
 *                                  uint16_t blockpos,
 *                                  unsigned int maxchan,
 *                                  int matrix_noise_shift,
 *                                  int access_unit_size_pow2,
 *                                  int32_t mask);
 *
 * In:    a1-a4 = samples, coeffs, bypassed_lsbs, noise_buffer; the remaining
 *        seven arguments are loaded from the stack into v1-sl:
 *        v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan,
 *        v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask.
 * Saves: v1-fp and lr; three further words (NOISE, DCH, MASK) are pushed
 *        for the loop and dropped again at label 98.
 *
 * Only maxchan of 1, 5 or 7 and access_unit_size_pow2 of 64 or 128 are
 * handled here; anything else tail-calls ff_mlp_rematrix_channel (label 99).
 */
function ff_mlp_rematrix_channel_arm, export=1
        push    {v1-fp,lr}
        add     v1, sp, #9*4        // point at arguments on stack
        ldm     v1, {v1-sl}
        teq     v4, #1
        itt     ne
        teqne   v4, #5
        teqne   v4, #7
        bne     99f
        teq     v6, #64
        it      ne
        teqne   v6, #128
        bne     99f
        sub     v2, v2, #MAX_CHANNELS
        push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
        movs    INDEX, v3, lsl #7
        beq     98f                 // just in case, do nothing if blockpos = 0
        subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
        adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
        orr     INDEX, INDEX, lr
        // Switch on matrix_noise_shift: values 0 and 1 are
        // disproportionately common so do those in a form the branch
        // predictor can accelerate. Values can only go up to 15.
        cmp     v5, #1
        beq     11f
        blo     10f
A       ldr     v5, [pc, v5, lsl #2]
A       add     pc, pc, v5
T       tbh     [pc, v5, lsl #1]
0:
        // The first two table entries are placeholders: shifts 0 and 1 were
        // dispatched by the compare above.
        branch_pic_label          0,          0, (12f - 0b), (13f - 0b)
        branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
        branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
        branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
10:     switch_on_au_size  0
11:     switch_on_au_size  1
12:     switch_on_au_size  2
13:     switch_on_au_size  3
14:     switch_on_au_size  4
15:     switch_on_au_size  5
16:     switch_on_au_size  6
17:     switch_on_au_size  7
18:     switch_on_au_size  8
19:     switch_on_au_size  9
20:     switch_on_au_size  10
21:     switch_on_au_size  11
22:     switch_on_au_size  12
23:     switch_on_au_size  13
24:     switch_on_au_size  14
25:     switch_on_au_size  15

98:     add     sp, sp, #3*4
        pop     {v1-fp,pc}
99:     // Can't handle these parameters, drop back to C
        pop     {v1-fp,lr}
        b       X(ff_mlp_rematrix_channel)
endfunc

645 | .unreq PSA |
||
646 | .unreq PCO |
||
647 | .unreq PBL |
||
648 | .unreq INDEX |
||
649 | .unreq CO0 |
||
650 | .unreq CO1 |
||
651 | .unreq CO2 |
||
652 | .unreq CO3 |
||
653 | .unreq SA0 |
||
654 | .unreq SA1 |
||
655 | .unreq SA2 |
||
656 | .unreq SA3 |
||
657 | .unreq AC0 |
||
658 | .unreq AC1 |
||
659 | .unreq NOISE |
||
660 | .unreq LSB |
||
661 | .unreq DCH |
||
662 | .unreq MASK7><7>15 |