Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
||
23 | |||
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Byte-shuffle control used with pshufb: within every 32-bit lane the four
; source bytes are selected in reverse order (3,2,1,0), i.e. a per-dword
; byte swap.  One dword lane per row.
; NOTE(review): the mask is loaded with mova, so it must be 16-byte aligned;
; this presumably relies on the alignment SECTION_RODATA provides -- confirm.
pb_bswap32: db  3,  2,  1,  0
            db  7,  6,  5,  4
            db 11, 10,  9,  8
            db 15, 14, 13, 12

; NOTE(review): pb_80 is not referenced by any code visible in this file;
; the declaration is kept as-is in case something outside this view needs it.
cextern pb_80

SECTION .text
||
32 | |||
;------------------------------------------------------------------------------
; BSWAP_LOOPS %1
;   %1 = a (aligned) or u (unaligned).  The letter is pasted onto "mov" for
;        every full-vector load/store and onto the label names, so the macro
;        can be expanded once per alignment case inside the same function.
;
; Byte-swaps 32-bit words from [r1] to [r0]: first in chunks of eight words
; (two 16-byte vectors per iteration), then one optional chunk of four.
;   in:       r0 = dst, r1 = src, r2d = number of 32-bit words
;             m2 = pshufb mask (only read when built with cpuflag(ssse3))
;   out:      r0/r1 advanced past everything processed,
;             r2d = the original word count again
;   clobbers: r3d, flags, m0-m1 (SSSE3 build) or m0-m3 (SSE2 build)
; Jumps to .left when bit 2 of the count is clear; otherwise execution runs
; off the end of the macro.  The enclosing function must define .left.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS 1
    mov      r3d, r2d                 ; keep the full count for the tail tests
    sar      r2d, 3                   ; r2d = count / 8 = number of 32-byte chunks
    jz       .tail4_%1                ; fewer than eight words: skip the main loop
.chunk8_%1:
    mov%1    m0, [r1]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    ; One byte shuffle per vector does the whole swap.
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0], m0
    mov%1    [r0 + 16], m1
%else
    ; SSE2 has no byte shuffle, so swap in two steps:
    ;   1) 0xB1 (= 10110001b, word order 2,3,0,1) exchanges the two 16-bit
    ;      halves of every dword, low and high quadword separately;
    ;   2) (x << 8) | (x >> 8) on 16-bit lanes exchanges the bytes of each half.
    pshuflw  m0, m0, 0xB1
    pshuflw  m1, m1, 0xB1
    pshufhw  m0, m0, 0xB1
    pshufhw  m1, m1, 0xB1
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r1, 32
    add      r0, 32
    dec      r2d
    jnz      .chunk8_%1
.tail4_%1:
    mov      r2d, r3d                 ; hand the untouched count back to the caller
    test     r3d, 4                   ; is there a leftover group of four words?
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    ; Same two-step swap as in the main loop, on a single vector.
    pshuflw  m0, m0, 0xB1
    pshufhw  m0, m0, 0xB1
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r0, 16
    add      r1, 16
%endmacro
||
86 | |||
;------------------------------------------------------------------------------
; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Writes the byte-swapped value of each of the w 32-bit words at src to dst.
; NOTE(review): the exported symbol is produced by cglobal from "bswap32_buf";
; the exact prefix and per-cpu suffix come from x86inc -- confirm there.
;
;   r0  = dst, r1 = src, r2d = w (word count)
;   r3  = scratch
;   m2  = pb_bswap32 shuffle mask (SSSE3 build); plain scratch in the SSE2 build
;
; Expanded once per instruction set after INIT_XMM; cpuflag(ssse3) picks the
; pshufb path, otherwise the SSE2 shift/or path is assembled.
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3            ; uses m0-m2
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,4            ; uses m0-m3 (BSWAP_LOOPS temporaries)
%endif
    ; Take the aligned-move path only when both pointers are 16-byte aligned.
    mov      r3, r1
    or       r3, r0
    test     r3, 15
    jz       .start_align
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
    ; Here r2d holds the original word count again and r0/r1 point at the
    ; first word not yet processed; at most three words remain (count & 3).
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     m0, [r1]                 ; two words at once through the low 64 bits
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]                ; final single word via scalar bswap
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                   ; r2d = number of leftover words
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro
||
134 | |||
; Assemble the function once per instruction set.  The argument given to
; INIT_XMM is what the cpuflag(ssse3) tests inside the macros evaluate, so the
; first expansion takes the %else (SSE2) branches and the second the SSSE3 ones.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF
139 | BSWAP32_BUF |