Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright (c) 2014 RISC OS Open Ltd
  3.  * Author: Ben Avison <bavison@riscosopen.org>
  4.  *
  5.  * This file is part of FFmpeg.
  6.  *
  7.  * FFmpeg is free software; you can redistribute it and/or
  8.  * modify it under the terms of the GNU Lesser General Public
  9.  * License as published by the Free Software Foundation; either
  10.  * version 2.1 of the License, or (at your option) any later version.
  11.  *
  12.  * FFmpeg is distributed in the hope that it will be useful,
  13.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15.  * Lesser General Public License for more details.
  16.  *
  17.  * You should have received a copy of the GNU Lesser General Public
  18.  * License along with FFmpeg; if not, write to the Free Software
  19.  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20.  */
  21.  
  22. #include "libavutil/arm/asm.S"
  23.  
  // Layout constants used by the code below: MAX_FIR_ORDER words separate the
  // FIR and IIR coefficients, MAX_BLOCKSIZE + MAX_FIR_ORDER words separate the
  // FIR and IIR state, and MAX_CHANNELS is the per-sample stride of the
  // sample and bypassed_lsbs buffers.
  // NOTE(review): presumably these mirror the definitions used by the C
  // decoder - confirm before changing any of them.
  24. #define MAX_CHANNELS        8
  25. #define MAX_FIR_ORDER       8
  26. #define MAX_IIR_ORDER       4
  27. #define MAX_RATEFACTOR      4
  28. #define MAX_BLOCKSIZE       (40 * MAX_RATEFACTOR)
  29.  
  // Register aliases for ff_mlp_filter_channel_arm.
  30. PST     .req    a1 // state pointer (argument 1), pre-decremented once per sample
  31. PCO     .req    a2 // coefficient pointer (argument 2)
  32. AC0     .req    a3 // accumulator low word; holds firorder on entry
  33. AC1     .req    a4 // accumulator high word; holds iirorder on entry
  34. CO0     .req    v1 // CO0-CO3: bank of four coefficient registers
  35. CO1     .req    v2
  36. CO2     .req    v3
  37. CO3     .req    v4
  38. ST0     .req    v5 // ST0-ST3: bank of four state registers; ST0 is loaded with filter_shift on entry
  39. ST1     .req    v6 // loaded with mask on entry
  40. ST2     .req    sl
  41. ST3     .req    fp
  42. I       .req    ip // samples still to process (loaded with blocksize)
  43. PSAMP   .req    lr // sample_buffer pointer, advanced by MAX_CHANNELS words per sample
  44.  
  45.  
  // Emits one jump-table entry for each argument, where each argument is the
  // distance from the table start (local label 0) to a branch target.
  // ARM entries are words holding distance - 4, because the add-to-pc that
  // consumes them reads pc as table start + 4. Thumb entries are halfwords
  // holding distance / 2, the unit expected by tbh.
  // NOTE(review): the A and T line prefixes come from the included asm.S and
  // presumably keep a line only when assembling for ARM or Thumb respectively.
  46. .macro branch_pic_label first, remainder:vararg
  47. A       .word           \first   - 4
  48. T       .hword          (\first) / 2
  49. .ifnb   \remainder
  50.         branch_pic_label \remainder
  51. .endif
  52. .endm
  53.  
  54. // Some macros that do loads/multiplies where the register number is determined
  55. // from an assembly-time expression. Boy is GNU assembler's syntax ugly...
  56.  
  // Loads one word from [base + offset] into the register alias formed from
  // group and index, e.g. CO2 or ST0. index may be any assembly-time
  // expression: altmacro mode is enabled just long enough for %(...) to
  // reduce it to a plain number before the worker macro is expanded.
  57. .macro load  group, index, base, offset
  58.        .altmacro
  59.        load_ \group, %(\index), \base, \offset
  60.        .noaltmacro
  61. .endm
  62.  
  // Worker for load: index is already a literal number here, so group and
  // index paste together into the name of a register alias.
  63. .macro load_ group, index, base, offset
  64.         ldr     \group\index, [\base, #\offset]
  65. .endm
  66.  
  // As load, but fetches two consecutive words into the aliases numbered
  // index and index + 1. The second number is not wrapped modulo 4, so the
  // caller must pass an index whose successor is a valid alias.
  67. .macro loadd  group, index, base, offset
  68.        .altmacro
  69.        loadd_ \group, %(\index), %(\index+1), \base, \offset
  70.        .noaltmacro
  71. .endm
  72.  
  // Worker for loadd. On the ARM-only lines, offsets of 256 or more are split
  // into two single-word loads (presumably because ARM-mode ldrd cannot encode
  // an offset that large - confirm against the architecture manual); every
  // other case uses one ldrd.
  73. .macro loadd_ group, index0, index1, base, offset
  74. A .if \offset >= 256
  75. A       ldr     \group\index0, [\base, #\offset]
  76. A       ldr     \group\index1, [\base, #(\offset) + 4]
  77. A .else
  78.         ldrd    \group\index0, \group\index1, [\base, #\offset]
  79. A .endif
  80. .endm
  81.  
  // Multiplies the coefficient and state registers numbered index. index may
  // be any assembly-time expression; it is reduced to a number via altmacro
  // mode and the real work is done by the worker macro. accumulate and long
  // are passed through unchanged.
  82. .macro multiply  index, accumulate, long
  83.         .altmacro
  84.         multiply_ %(\index), \accumulate, \long
  85.         .noaltmacro
  86. .endm
  87.  
  // Worker for multiply: emits the product of CO<index> and ST<index>.
  //   long        nonzero: 64-bit result in AC1:AC0 (smull / smlal)
  //               zero:    32-bit result in AC0 only (mul / mla)
  //   accumulate  nonzero: add the product to the accumulator
  //               zero:    overwrite the accumulator with the product
  88. .macro multiply_  index, accumulate, long
  89.  .if \long
  90.   .if \accumulate
  91.         smlal   AC0, AC1, CO\index, ST\index
  92.   .else
  93.         smull   AC0, AC1, CO\index, ST\index
  94.   .endif
  95.  .else
  96.   .if \accumulate
  97.         mla     AC0, CO\index, ST\index, AC0
  98.   .else
  99.         mul     AC0, CO\index, ST\index
  100.   .endif
  101.  .endif
  102. .endm
  103.  
  104. // A macro to update the load register number and load offsets
  105.  
  // Advances the load bookkeeping by howmany taps: LOAD_REG steps round the
  // four-register bank, both load offsets move on by howmany words, and the
  // count of outstanding FIR taps (or of IIR taps once no FIR taps remain)
  // drops by howmany. When the last FIR tap has been consumed, the offsets
  // jump to the start of the IIR coefficients and the IIR state.
  106. .macro inc  howmany
  107.   .set LOAD_REG, (LOAD_REG + \howmany) & 3
  108.   .set OFFSET_CO, OFFSET_CO + 4 * \howmany
  109.   .set OFFSET_ST, OFFSET_ST + 4 * \howmany
  110.   .if FIR_REMAIN > 0
  111.     .set FIR_REMAIN, FIR_REMAIN - \howmany
  112.     .if FIR_REMAIN == 0
  113.       .set OFFSET_CO, 4 * MAX_FIR_ORDER
  114.       .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
  115.     .endif
  116.   .elseif IIR_REMAIN > 0
  117.     .set IIR_REMAIN, IIR_REMAIN - \howmany
  118.   .endif
  119. .endm
  120.  
  121. // Macro to implement the inner loop for one specific combination of parameters
  122.  
  // Expands to the complete sample loop for one fixed parameter combination.
  //   mask_minus1  nonzero if mask is -1, so the AND with the mask is omitted
  //   shift_0      nonzero if filter_shift is 0 (only 32-bit multiplies are used)
  //   shift_8      nonzero if filter_shift is 8 (shift by a constant)
  //   iir_taps     number of IIR taps
  //   fir_taps     number of FIR taps
  // Expects on entry: PST = state pointer, PCO = coefficient pointer,
  // ST0 = filter_shift, ST1 = mask, I = blocksize, PSAMP = sample pointer.
  // Leaves through a branch to local label 99, which the expanding function
  // must provide.
  123. .macro implement_filter  mask_minus1, shift_0, shift_8, iir_taps, fir_taps
  124.   .set TOTAL_TAPS, \iir_taps + \fir_taps
  125.  
  126.   // Deal with register allocation...
        // DEFINED_x:  alias x was created here and must be released at the end
        // SHUFFLE_x:  x must be copied out of ST0/ST1 before the loop starts
        // SPILL_x:    no spare register; x shares ST0/ST1 and is reloaded from
        //             the stacked arguments on every iteration
  127.   .set DEFINED_SHIFT, 0
  128.   .set DEFINED_MASK, 0
  129.   .set SHUFFLE_SHIFT, 0
  130.   .set SHUFFLE_MASK, 0
  131.   .set SPILL_SHIFT, 0
  132.   .set SPILL_MASK, 0
  133.   .if TOTAL_TAPS == 0
  134.     // Little register pressure in this case - just keep MASK where it was
  135.     .if !\mask_minus1
  136.       MASK .req ST1
  137.       .set DEFINED_MASK, 1
  138.     .endif
  139.   .else
  140.     .if \shift_0
  141.       .if !\mask_minus1
  142.         // AC1 is unused with shift 0
  143.         MASK .req AC1
  144.         .set DEFINED_MASK, 1
  145.         .set SHUFFLE_MASK, 1
  146.       .endif
  147.     .elseif \shift_8
  148.       .if !\mask_minus1
  149.         .if TOTAL_TAPS <= 4
  150.         // All coefficients are preloaded (so pointer not needed)
  151.           MASK .req PCO
  152.           .set DEFINED_MASK, 1
  153.           .set SHUFFLE_MASK, 1
  154.         .else
  155.           .set SPILL_MASK, 1
  156.         .endif
  157.       .endif
  158.     .else // shift not 0 or 8
  159.       .if TOTAL_TAPS <= 3
  160.         // All coefficients are preloaded, and at least one CO register is unused
  161.         .if \fir_taps & 1
  162.           SHIFT .req CO0
  163.           .set DEFINED_SHIFT, 1
  164.           .set SHUFFLE_SHIFT, 1
  165.         .else
  166.           SHIFT .req CO3
  167.           .set DEFINED_SHIFT, 1
  168.           .set SHUFFLE_SHIFT, 1
  169.         .endif
  170.         .if !\mask_minus1
  171.           MASK .req PCO
  172.           .set DEFINED_MASK, 1
  173.           .set SHUFFLE_MASK, 1
  174.         .endif
  175.       .elseif TOTAL_TAPS == 4
  176.         // All coefficients are preloaded
  177.         SHIFT .req PCO
  178.         .set DEFINED_SHIFT, 1
  179.         .set SHUFFLE_SHIFT, 1
  180.         .if !\mask_minus1
  181.           .set SPILL_MASK, 1
  182.         .endif
  183.       .else
  184.         .set SPILL_SHIFT, 1
  185.         .if !\mask_minus1
  186.           .set SPILL_MASK, 1
  187.         .endif
  188.       .endif
  189.     .endif
  190.   .endif
        // Spilled values reuse the registers that the entry code loaded them
        // into; the loop overwrites those with filter state and reloads the
        // values from the stack after the multiplies.
  191.   .if SPILL_SHIFT
  192.     SHIFT .req ST0
  193.     .set DEFINED_SHIFT, 1
  194.   .endif
  195.   .if SPILL_MASK
  196.     MASK .req ST1
  197.     .set DEFINED_MASK, 1
  198.   .endif
  199.  
  200.         // Preload coefficients if possible
        // The starting register (1 for an odd FIR tap count, otherwise 0)
        // is the same one the loop below starts loading state into, so each
        // coefficient lines up with the state word of the same number.
  201.   .if TOTAL_TAPS <= 4
  202.     .set OFFSET_CO, 0
  203.     .if \fir_taps & 1
  204.       .set LOAD_REG, 1
  205.     .else
  206.       .set LOAD_REG, 0
  207.     .endif
  208.     .rept \fir_taps
  209.         load    CO, LOAD_REG, PCO, OFFSET_CO
  210.       .set LOAD_REG, (LOAD_REG + 1) & 3
  211.       .set OFFSET_CO, OFFSET_CO + 4
  212.     .endr
  213.     .set OFFSET_CO, 4 * MAX_FIR_ORDER
  214.     .rept \iir_taps
  215.         load    CO, LOAD_REG, PCO, OFFSET_CO
  216.       .set LOAD_REG, (LOAD_REG + 1) & 3
  217.       .set OFFSET_CO, OFFSET_CO + 4
  218.     .endr
  219.   .endif
  220.  
  221.         // Move mask/shift to final positions if necessary
  222.         // Need to do this after preloading, because in some cases we
  223.         // reuse the coefficient pointer register
  224.   .if SHUFFLE_SHIFT
  225.         mov     SHIFT, ST0
  226.   .endif
  227.   .if SHUFFLE_MASK
  228.         mov     MASK, ST1
  229.   .endif
  230.  
  231.         // Begin loop
  232. 01:
  233.   .if TOTAL_TAPS == 0
  234.         // Things simplify a lot in this case
  235.         // In fact this could be pipelined further if it's worth it...
  236.         ldr     ST0, [PSAMP]
  237.         subs    I, I, #1
  238.     .if !\mask_minus1
  239.         and     ST0, ST0, MASK
  240.     .endif
        // The same value goes to the FIR state, the IIR state and back to
        // the sample buffer; the flags from subs survive the stores.
  241.         str     ST0, [PST, #-4]!
  242.         str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
  243.         str     ST0, [PSAMP], #4 * MAX_CHANNELS
  244.         bne     01b
  245.   .else
  246.     .if \fir_taps & 1
  247.       .set LOAD_REG, 1
  248.     .else
  249.       .set LOAD_REG, 0
  250.     .endif
  251.     .set LOAD_BANK, 0
  252.     .set FIR_REMAIN, \fir_taps
  253.     .set IIR_REMAIN, \iir_taps
  254.     .if FIR_REMAIN == 0 // only IIR terms
  255.       .set OFFSET_CO, 4 * MAX_FIR_ORDER
  256.       .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
  257.     .else
  258.       .set OFFSET_CO, 0
  259.       .set OFFSET_ST, 0
  260.     .endif
  261.     .set MUL_REG, LOAD_REG
  262.     .set COUNTER, 0
        // TOTAL_TAPS + 2 passes: the first two passes only load, and every
        // later pass emits one multiply, giving TOTAL_TAPS multiplies in all.
        // LOAD_BANK alternates paired loads between coefficients and state.
  263.     .rept TOTAL_TAPS + 2
  264.         // Do load(s)
  265.      .if FIR_REMAIN != 0 || IIR_REMAIN != 0
  266.       .if COUNTER == 0
  267.        .if TOTAL_TAPS > 4
  268.         load    CO, LOAD_REG, PCO, OFFSET_CO
  269.        .endif
  270.         load    ST, LOAD_REG, PST, OFFSET_ST
  271.         inc     1
  272.       .elseif COUNTER == 1 && (\fir_taps & 1) == 0
  273.        .if TOTAL_TAPS > 4
  274.         load    CO, LOAD_REG, PCO, OFFSET_CO
  275.        .endif
  276.         load    ST, LOAD_REG, PST, OFFSET_ST
  277.         inc     1
  278.       .elseif LOAD_BANK == 0
  279.        .if TOTAL_TAPS > 4
  280.         .if FIR_REMAIN == 0 && IIR_REMAIN == 1
  281.         load    CO, LOAD_REG, PCO, OFFSET_CO
  282.         .else
  283.         loadd   CO, LOAD_REG, PCO, OFFSET_CO
  284.         .endif
  285.        .endif
  286.        .set LOAD_BANK, 1
  287.       .else
  288.        .if FIR_REMAIN == 0 && IIR_REMAIN == 1
  289.         load    ST, LOAD_REG, PST, OFFSET_ST
  290.         inc     1
  291.        .else
  292.         loadd   ST, LOAD_REG, PST, OFFSET_ST
  293.         inc     2
  294.        .endif
  295.        .set LOAD_BANK, 0
  296.       .endif
  297.      .endif
  298.  
  299.         // Do interleaved multiplies, slightly delayed
  300.      .if COUNTER >= 2
  301.         multiply MUL_REG, COUNTER > 2, !\shift_0
  302.       .set MUL_REG, (MUL_REG + 1) & 3
  303.      .endif
  304.      .set COUNTER, COUNTER + 1
  305.     .endr
  306.  
  307.         // Post-process the result of the multiplies
        // Nine registers were pushed on entry, so the stacked arguments
        // filter_shift and mask sit at sp + 9*4 and sp + 9*4 + 4.
  308.     .if SPILL_SHIFT
  309.         ldr     SHIFT, [sp, #9*4 + 0*4]
  310.     .endif
  311.     .if SPILL_MASK
  312.         ldr     MASK, [sp, #9*4 + 1*4]
  313.     .endif
  314.         ldr     ST2, [PSAMP]
  315.         subs    I, I, #1
        // Reduce the accumulator to AC0 = low word of (AC1:AC0 >> shift).
        // With shift 0 the accumulator is already the 32-bit value in AC0.
  316.     .if \shift_8
  317.         mov     AC0, AC0, lsr #8
  318.         orr     AC0, AC0, AC1, lsl #24
  319.     .elseif !\shift_0
  320.         rsb     ST3, SHIFT, #32
  321.         mov     AC0, AC0, lsr SHIFT
  322. A       orr     AC0, AC0, AC1, lsl ST3
  323. T       mov     AC1, AC1, lsl ST3
  324. T       orr     AC0, AC0, AC1
  325.     .endif
        // ST3 = (sample + accumulator), masked unless mask is -1.
        // ST2 = ST3 - accumulator, which is the unmodified sample when no
        // masking takes place.
  326.     .if \mask_minus1
  327.         add     ST3, ST2, AC0
  328.     .else
  329.         add     ST2, ST2, AC0
  330.         and     ST3, ST2, MASK
  331.         sub     ST2, ST3, AC0
  332.     .endif
        // ST3 becomes the newest FIR state word and the output sample;
        // ST2 becomes the newest IIR state word.
  333.         str     ST3, [PST, #-4]!
  334.         str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
  335.         str     ST3, [PSAMP], #4 * MAX_CHANNELS
  336.         bne     01b
  337.   .endif
  338.         b       99f
  339.  
        // Release the aliases so that the next expansion of this macro can
        // bind SHIFT and MASK to different registers.
  340.   .if DEFINED_SHIFT
  341.     .unreq SHIFT
  342.   .endif
  343.   .if DEFINED_MASK
  344.     .unreq MASK
  345.   .endif
  346. .endm
  347.  
  // Dispatches on firorder (still held in a3) through a position-independent
  // jump table, then expands implement_filter once for every FIR order that
  // is permitted alongside iir_taps. Orders above 8 - iir_taps get neither a
  // table entry nor code. The ARM-mode dispatch uses CO0 as scratch.
  348. .macro switch_on_fir_taps  mask_minus1, shift_0, shift_8, iir_taps
  349. A       ldr     CO0, [pc, a3, lsl #2]   // firorder is in range 0-(8-iir_taps)
  350. A       add     pc,  pc,  CO0
  351. T       tbh     [pc, a3, lsl #1]
  352. 0:
        // Table start: every entry below is relative to local label 0.
  353.         branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
  354.         branch_pic_label (74f - 0b)
  355.  .if \iir_taps <= 3
  356.         branch_pic_label (75f - 0b)
  357.   .if \iir_taps <= 2
  358.         branch_pic_label (76f - 0b)
  359.    .if \iir_taps <= 1
  360.         branch_pic_label (77f - 0b)
  361.     .if \iir_taps == 0
  362.         branch_pic_label (78f - 0b)
  363.     .endif
  364.    .endif
  365.   .endif
  366.  .endif
        // One specialised loop per FIR order; label 7n handles n FIR taps.
  367. 70:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
  368. 71:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
  369. 72:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
  370. 73:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
  371. 74:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
  372.  .if \iir_taps <= 3
  373. 75:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
  374.   .if \iir_taps <= 2
  375. 76:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
  376.    .if \iir_taps <= 1
  377. 77:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
  378.     .if \iir_taps == 0
  379. 78:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
  380.     .endif
  381.    .endif
  382.   .endif
  383.  .endif
  384. .endm
  385.  
  // Dispatches on iirorder (still held in a4) through a five-entry
  // position-independent jump table, then expands switch_on_fir_taps once for
  // each IIR order 0 to 4. The ARM-mode dispatch uses CO0 as scratch.
  386. .macro switch_on_iir_taps  mask_minus1, shift_0, shift_8
  387. A       ldr     CO0, [pc, a4, lsl #2]   // iirorder is in range 0-4
  388. A       add     pc,  pc,  CO0
  389. T       tbh     [pc, a4, lsl #1]
  390. 0:
        // Table start: every entry below is relative to local label 0.
  391.         branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
  392.         branch_pic_label (64f - 0b)
        // Label 6n handles n IIR taps.
  393. 60:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 0
  394. 61:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 1
  395. 62:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 2
  396. 63:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 3
  397. 64:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 4
  398. .endm
  399.  
  400. /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
  401.  *                                int firorder, int iirorder,
  402.  *                                unsigned int filter_shift, int32_t mask,
  403.  *                                int blocksize, int32_t *sample_buffer);
  404.  */
  // Entry point. Saves v1-fp and lr, fetches the four stacked arguments, then
  // selects one of six families of specialised loops according to whether
  // mask is -1 and whether filter_shift is 0, 8 or something else. Each
  // family dispatches further on iirorder (a4) and firorder (a3). Every
  // specialised loop ends by branching to label 99, the shared epilogue.
  405. function ff_mlp_filter_channel_arm, export=1
  406.         push    {v1-fp,lr}
  407.         add     v1, sp, #9*4 // point at arguments on stack
        // In register-number order: ST0 = filter_shift, ST1 = mask,
        // I = blocksize, PSAMP = sample_buffer.
  408.         ldm     v1, {ST0,ST1,I,PSAMP}
  409.         cmp     ST1, #-1
  410.         bne     30f
        // Z is set when the low three bits of the shift are zero (shift is
        // 0 or 8); the carry then holds bit 3 and tells the two apart.
  411.         movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
  412.         bne     20f
  413.         bcs     10f
        // Macro arguments are: mask_minus1, shift_0, shift_8.
  414.         switch_on_iir_taps 1, 1, 0
  415. 10:     switch_on_iir_taps 1, 0, 1
  416. 20:     switch_on_iir_taps 1, 0, 0
  417. 30:     movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
  418.         bne     50f
  419.         bcs     40f
  420.         switch_on_iir_taps 0, 1, 0
  421. 40:     switch_on_iir_taps 0, 0, 1
  422. 50:     switch_on_iir_taps 0, 0, 0
  423. 99:     pop     {v1-fp,pc}
  424. endfunc
  425.  
  // Release every alias used by the filter routine; several of these names
  // are bound to different registers for the rematrix routine that follows.
  426.         .unreq  PST
  427.         .unreq  PCO
  428.         .unreq  AC0
  429.         .unreq  AC1
  430.         .unreq  CO0
  431.         .unreq  CO1
  432.         .unreq  CO2
  433.         .unreq  CO3
  434.         .unreq  ST0
  435.         .unreq  ST1
  436.         .unreq  ST2
  437.         .unreq  ST3
  438.         .unreq  I
  439.         .unreq  PSAMP
  440.  
  441. /********************************************************************/
  442.  
  // Register aliases for ff_mlp_rematrix_channel_arm. NOISE, LSB, DCH and
  // MASK share registers with the sample bank SA0-SA3, so they are only valid
  // once the multiplies that read those sample registers have been issued.
  443. PSA     .req    a1 // samples
  444. PCO     .req    a2 // coeffs
  445. PBL     .req    a3 // bypassed_lsbs
  446. INDEX   .req    a4 // packed loop counter and noise index; layout described below
  447. CO0     .req    v1 // CO0-CO3: bank of four coefficient registers
  448. CO1     .req    v2
  449. CO2     .req    v3
  450. CO3     .req    v4
  451. SA0     .req    v5 // SA0-SA3: bank of four sample registers
  452. SA1     .req    v6
  453. SA2     .req    sl
  454. SA3     .req    fp
  455. AC0     .req    ip // accumulator low word
  456. AC1     .req    lr // accumulator high word
  457. NOISE   .req    SA0 // noise_buffer pointer, then the sign-extended noise byte
  458. LSB     .req    SA1 // byte fetched from bypassed_lsbs
  459. DCH     .req    SA2 // dest_ch
  460. MASK    .req    SA3 // mask
  461.  
  462.     // INDEX is used as follows:
  463.     // bits 0..6   index2 (values up to 17, but wider so that we can
  464.     //               add to index field without needing to mask)
  465.     // bits 7..14  i (values up to 160)
  466.     // bit 15      underflow detect for i
  467.     // bits 25..31 (if access_unit_size_pow2 == 128)  \ index
  468.     // bits 26..31 (if access_unit_size_pow2 == 64)   /
  469.  
  // Expands to the complete per-sample rematrix loop for one parameter set.
  //   shift        matrix_noise_shift; 0 means no noise term is added
  //   index_mask   63 or 127, selecting a 6-bit or 7-bit noise index field;
  //                never evaluated when shift is 0
  //   mask_minus1  nonzero if mask is -1, so the AND with the mask is omitted
  //   maxchan      1, 5 or 7; maxchan + 1 coefficient/sample products are summed
  // Expects INDEX to be packed as described in the comment above this macro
  // and the top three stack words to hold noise_buffer, the precompensated
  // dest_ch and mask. Leaves through a branch to local label 98.
  470. .macro implement_rematrix  shift, index_mask, mask_minus1, maxchan
  471.     .if \maxchan == 1
  472.         // We can just leave the coefficients in registers in this case
  473.         ldrd    CO0, CO1, [PCO]
  474.     .endif
  475. 1:
        // Sum all products except the last; the final multiply is issued
        // after the ldm below. Loads are interleaved with the multiplies.
  476.     .if \maxchan == 1
  477.         ldrd    SA0, SA1, [PSA]
  478.         smull   AC0, AC1, CO0, SA0
  479.     .elseif \maxchan == 5
  480.         ldr     CO0, [PCO, #0]
  481.         ldr     SA0, [PSA, #0]
  482.         ldr     CO1, [PCO, #4]
  483.         ldr     SA1, [PSA, #4]
  484.         ldrd    CO2, CO3, [PCO, #8]
  485.         smull   AC0, AC1, CO0, SA0
  486.         ldrd    SA2, SA3, [PSA, #8]
  487.         smlal   AC0, AC1, CO1, SA1
  488.         ldrd    CO0, CO1, [PCO, #16]
  489.         smlal   AC0, AC1, CO2, SA2
  490.         ldrd    SA0, SA1, [PSA, #16]
  491.         smlal   AC0, AC1, CO3, SA3
  492.         smlal   AC0, AC1, CO0, SA0
  493.     .else // \maxchan == 7
  494.         ldr     CO2, [PCO, #0]
  495.         ldr     SA2, [PSA, #0]
  496.         ldr     CO3, [PCO, #4]
  497.         ldr     SA3, [PSA, #4]
  498.         ldrd    CO0, CO1, [PCO, #8]
  499.         smull   AC0, AC1, CO2, SA2
  500.         ldrd    SA0, SA1, [PSA, #8]
  501.         smlal   AC0, AC1, CO3, SA3
  502.         ldrd    CO2, CO3, [PCO, #16]
  503.         smlal   AC0, AC1, CO0, SA0
  504.         ldrd    SA2, SA3, [PSA, #16]
  505.         smlal   AC0, AC1, CO1, SA1
  506.         ldrd    CO0, CO1, [PCO, #24]
  507.         smlal   AC0, AC1, CO2, SA2
  508.         ldrd    SA0, SA1, [PSA, #24]
  509.         smlal   AC0, AC1, CO3, SA3
  510.         smlal   AC0, AC1, CO0, SA0
  511.     .endif
        // Reload the three loop constants over SA0, SA2 and SA3. SA1 is
        // left alone because the final multiply below still reads it.
  512.         ldm     sp, {NOISE, DCH, MASK}
  513.         smlal   AC0, AC1, CO1, SA1
  514.     .if \shift != 0
        // The top bits of INDEX select the noise byte; adding the index2
        // field into those bits steps the index and lets it wrap naturally.
  515.       .if \index_mask == 63
  516.         add     NOISE, NOISE, INDEX, lsr #32-6
  517.         ldrb    LSB, [PBL], #MAX_CHANNELS
  518.         ldrsb   NOISE, [NOISE]
  519.         add     INDEX, INDEX, INDEX, lsl #32-6
  520.       .else // \index_mask == 127
  521.         add     NOISE, NOISE, INDEX, lsr #32-7
  522.         ldrb    LSB, [PBL], #MAX_CHANNELS
  523.         ldrsb   NOISE, [NOISE]
  524.         add     INDEX, INDEX, INDEX, lsl #32-7
  525.       .endif
  526.         sub     INDEX, INDEX, #1<<7
        // 64-bit add of the shifted noise byte; the high word only ever
        // receives the sign of the noise value plus the carry.
  527.         adds    AC0, AC0, NOISE, lsl #\shift + 7
  528.         adc     AC1, AC1, NOISE, asr #31
  529.     .else
  530.         ldrb    LSB, [PBL], #MAX_CHANNELS
  531.         sub     INDEX, INDEX, #1<<7
  532.     .endif
  533.         add     PSA, PSA, #MAX_CHANNELS*4
        // AC0 = low word of (AC1:AC0 >> 14)
  534.         mov     AC0, AC0, lsr #14
  535.         orr     AC0, AC0, AC1, lsl #18
  536.     .if !\mask_minus1
  537.         and     AC0, AC0, MASK
  538.     .endif
  539.         add     AC0, AC0, LSB
        // Bit 15 of INDEX becomes set once the sample counter in bits 7..14
        // has been decremented below zero.
  540.         tst     INDEX, #1<<15
  541.         str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
  542.         beq     1b
  543.         b       98f
  544. .endm
  545.  
  // Selects the rematrix loop for maxchan, which the entry code left in v4:
  // below 5 takes the variant for 1, exactly 5 takes the variant for 5 and
  // anything higher takes the variant for 7. The test happens before the
  // loops overwrite v4, which they also use as CO3.
  546. .macro switch_on_maxchan  shift, index_mask, mask_minus1
  547.         cmp     v4, #5
  548.         blo     51f
  549.         beq     50f
  550.         implement_rematrix  \shift, \index_mask, \mask_minus1, 7
  551. 50:     implement_rematrix  \shift, \index_mask, \mask_minus1, 5
  552. 51:     implement_rematrix  \shift, \index_mask, \mask_minus1, 1
  553. .endm
  554.  
  // Selects between the loops that skip masking (mask, held in sl, is -1) and
  // the loops that apply it.
  555. .macro switch_on_mask  shift, index_mask
  556.         cmp     sl, #-1
  557.         bne     40f
  558.         switch_on_maxchan  \shift, \index_mask, 1
  559. 40:     switch_on_maxchan  \shift, \index_mask, 0
  560. .endm
  561.  
  // Selects the loops for access_unit_size_pow2 (held in v6) and inserts the
  // noise index (held in v1) into the top 6 or 7 bits of INDEX to match.
  // With shift 0 no noise is read, so neither step is needed and a
  // placeholder is passed as index_mask.
  562. .macro switch_on_au_size  shift
  563.   .if \shift == 0
  564.         switch_on_mask  \shift, undefined
  565.   .else
  566.         teq     v6, #64
  567.         bne     30f
  568.         orr     INDEX, INDEX, v1, lsl #32-6
  569.         switch_on_mask  \shift, 63
  570. 30:     orr     INDEX, INDEX, v1, lsl #32-7
  571.         switch_on_mask  \shift, 127
  572.   .endif
  573. .endm
  574.  
  575. /* void ff_mlp_rematrix_channel_arm(int32_t *samples,
  576.  *                                  const int32_t *coeffs,
  577.  *                                  const uint8_t *bypassed_lsbs,
  578.  *                                  const int8_t *noise_buffer,
  579.  *                                  int index,
  580.  *                                  unsigned int dest_ch,
  581.  *                                  uint16_t blockpos,
  582.  *                                  unsigned int maxchan,
  583.  *                                  int matrix_noise_shift,
  584.  *                                  int access_unit_size_pow2,
  585.  *                                  int32_t mask);
  586.  */
  // Entry point. Handles maxchan values 1, 5 and 7 with access_unit_size_pow2
  // of 64 or 128; any other combination is passed on unchanged to the C
  // implementation. Supported cases dispatch on matrix_noise_shift, then on
  // access unit size, mask and maxchan, into one specialised loop. Every loop
  // ends by branching to label 98.
  587. function ff_mlp_rematrix_channel_arm, export=1
  588.         push    {v1-fp,lr}
  589.         add     v1, sp, #9*4 // point at arguments on stack
        // v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan,
        // v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask
  590.         ldm     v1, {v1-sl}
  591.         teq     v4, #1
  592.         itt     ne
  593.         teqne   v4, #5
  594.         teqne   v4, #7
  595.         bne     99f
  596.         teq     v6, #64
  597.         it      ne
  598.         teqne   v6, #128
  599.         bne     99f
        // The loop stores after advancing PSA by MAX_CHANNELS words, so
        // the destination channel index is reduced by the same amount here.
  600.         sub     v2, v2, #MAX_CHANNELS
  601.         push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
  602.         movs    INDEX, v3, lsl #7
  603.         beq     98f                 // just in case, do nothing if blockpos = 0
  604.         subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
  605.         adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
  606.         orr     INDEX, INDEX, lr
  607.         // Switch on matrix_noise_shift: values 0 and 1 are
  608.         // disproportionately common so do those in a form the branch
  609.         // predictor can accelerate. Values can only go up to 15.
  610.         cmp     v5, #1
  611.         beq     11f
  612.         blo     10f
  613. A       ldr     v5,  [pc,  v5,  lsl #2]
  614. A       add     pc,  pc,  v5
  615. T       tbh     [pc, v5, lsl #1]
  616. 0:
        // The first two entries are placeholders: shifts 0 and 1 were
        // already dispatched by the compare above and never reach the table.
  617.         branch_pic_label          0,          0, (12f - 0b), (13f - 0b)
  618.         branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
  619.         branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
  620.         branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
        // Label n + 10 handles matrix_noise_shift n.
  621. 10:     switch_on_au_size  0
  622. 11:     switch_on_au_size  1
  623. 12:     switch_on_au_size  2
  624. 13:     switch_on_au_size  3
  625. 14:     switch_on_au_size  4
  626. 15:     switch_on_au_size  5
  627. 16:     switch_on_au_size  6
  628. 17:     switch_on_au_size  7
  629. 18:     switch_on_au_size  8
  630. 19:     switch_on_au_size  9
  631. 20:     switch_on_au_size  10
  632. 21:     switch_on_au_size  11
  633. 22:     switch_on_au_size  12
  634. 23:     switch_on_au_size  13
  635. 24:     switch_on_au_size  14
  636. 25:     switch_on_au_size  15
  637.  
        // Normal exit: discard the three words pushed for NOISE, DCH and
        // MASK, then restore the saved registers and return.
  638. 98:     add     sp, sp, #3*4
  639.         pop     {v1-fp,pc}
  640. 99:     // Can't handle these parameters, drop back to C
        // a1-a4 and the stacked arguments are still untouched at this
        // point, so the C function sees the original call unchanged.
  641.         pop     {v1-fp,lr}
  642.         b       X(ff_mlp_rematrix_channel)
  643. endfunc
  644.  
  // Release every alias that was declared for the rematrix routine.
  645.         .unreq  PSA
  646.         .unreq  PCO
  647.         .unreq  PBL
  648.         .unreq  INDEX
  649.         .unreq  CO0
  650.         .unreq  CO1
  651.         .unreq  CO2
  652.         .unreq  CO3
  653.         .unreq  SA0
  654.         .unreq  SA1
  655.         .unreq  SA2
  656.         .unreq  SA3
  657.         .unreq  AC0
  658.         .unreq  AC1
  659.         .unreq  NOISE
  660.         .unreq  LSB
  661.         .unreq  DCH
  662.         .unreq  MASK
  663.