Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * DSP utils mmx functions are compiled twice for rnd/no_rnd |
||
3 | * Copyright (c) 2000, 2001 Fabrice Bellard |
||
4 | * Copyright (c) 2003-2004 Michael Niedermayer |
||
5 | * |
||
6 | * MMX optimization by Nick Kurshev |
||
7 | * mostly rewritten by Michael Niedermayer |
||
8 | * and improved by Zdenek Kabelac |
||
9 | * |
||
10 | * This file is part of FFmpeg. |
||
11 | * |
||
12 | * FFmpeg is free software; you can redistribute it and/or |
||
13 | * modify it under the terms of the GNU Lesser General Public |
||
14 | * License as published by the Free Software Foundation; either |
||
15 | * version 2.1 of the License, or (at your option) any later version. |
||
16 | * |
||
17 | * FFmpeg is distributed in the hope that it will be useful, |
||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
20 | * Lesser General Public License for more details. |
||
21 | * |
||
22 | * You should have received a copy of the GNU Lesser General Public |
||
23 | * License along with FFmpeg; if not, write to the Free Software |
||
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
25 | */ |
||
26 | |||
27 | #include <stddef.h> |
||
28 | #include <stdint.h> |
||
29 | |||
30 | // put_pixels |
||
/**
 * 8-pixel-wide half-pel interpolation in both x and y, stored to block.
 *
 * Assuming MOVQ_ZERO clears mm7 (the macro is defined elsewhere), each output
 * byte is (a + b + c + d + rnd) >> 2, where a and b are the source bytes at
 * columns x and x + 1 of one row, and c and d are the bytes at the same
 * columns of the row line_size bytes further on. rnd is whatever SET_RND
 * loads into mm6 (2 for the rnd build, 1 for no_rnd, per the existing
 * comment).
 *
 * @param block     destination; 8 bytes are written per row, rows are
 *                  line_size bytes apart
 * @param pixels    source; every row is read with two 8-byte loads at
 *                  offsets 0 and 1 (9 bytes), and h + 1 rows are read
 * @param line_size byte distance between consecutive rows, applied to both
 *                  the source and the destination
 * @param h         number of rows to produce; the loop emits two rows per
 *                  pass and exits only when h reaches exactly 0, so h has to
 *                  be a positive even number
 */
31 | STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, |
||
32 | ptrdiff_t line_size, int h) |
||
33 | { |
||
// mm7 is the source operand of every punpck*bw below. MOVQ_ZERO is defined
// elsewhere; presumably it zeroes mm7 so the unpacks widen bytes to words.
34 | MOVQ_ZERO(mm7); |
||
35 | SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
||
// Operand map: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a holds the running row offset shared by source and destination.
36 | __asm__ volatile( |
||
// Prologue: load the first source row at offsets 0 and 1, widen both, and
// add them. mm4 holds the pair sums of the low 4 pixels, mm5 of the high 4.
37 | "movq (%1), %%mm0 \n\t" |
||
38 | "movq 1(%1), %%mm4 \n\t" |
||
39 | "movq %%mm0, %%mm1 \n\t" |
||
40 | "movq %%mm4, %%mm5 \n\t" |
||
41 | "punpcklbw %%mm7, %%mm0 \n\t" |
||
42 | "punpcklbw %%mm7, %%mm4 \n\t" |
||
43 | "punpckhbw %%mm7, %%mm1 \n\t" |
||
44 | "punpckhbw %%mm7, %%mm5 \n\t" |
||
45 | "paddusw %%mm0, %%mm4 \n\t" |
||
46 | "paddusw %%mm1, %%mm5 \n\t" |
||
// Row offset starts at 0; pixels is advanced by one row so that
// (%1, REG_a) addresses the row after the one already summed, while
// (%2, REG_a) addresses the output row being produced.
47 | "xor %%"REG_a", %%"REG_a" \n\t" |
||
48 | "add %3, %1 \n\t" |
||
49 | ".p2align 3 \n\t" |
||
50 | "1: \n\t" |
||
// First row of the pass: pair sums of the next source row go to mm0/mm1.
51 | "movq (%1, %%"REG_a"), %%mm0 \n\t" |
||
52 | "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
||
53 | "movq %%mm0, %%mm1 \n\t" |
||
54 | "movq %%mm2, %%mm3 \n\t" |
||
55 | "punpcklbw %%mm7, %%mm0 \n\t" |
||
56 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
57 | "punpckhbw %%mm7, %%mm1 \n\t" |
||
58 | "punpckhbw %%mm7, %%mm3 \n\t" |
||
59 | "paddusw %%mm2, %%mm0 \n\t" |
||
60 | "paddusw %%mm3, %%mm1 \n\t" |
||
// Carried sums (mm4/mm5) + rounding constant + new sums, then >> 2.
61 | "paddusw %%mm6, %%mm4 \n\t" |
||
62 | "paddusw %%mm6, %%mm5 \n\t" |
||
63 | "paddusw %%mm0, %%mm4 \n\t" |
||
64 | "paddusw %%mm1, %%mm5 \n\t" |
||
65 | "psrlw $2, %%mm4 \n\t" |
||
66 | "psrlw $2, %%mm5 \n\t" |
||
// Pack the eight words back to bytes (unsigned saturation) and store the row.
67 | "packuswb %%mm5, %%mm4 \n\t" |
||
68 | "movq %%mm4, (%2, %%"REG_a") \n\t" |
||
69 | "add %3, %%"REG_a" \n\t" |
||
70 | |||
// Second row of the pass: the same steps with the register roles swapped.
// mm0/mm1 (sums of the row just read) now act as the upper row, and
// mm4/mm5 receive the new row's sums and are carried into the next pass.
71 | "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
||
72 | "movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
||
73 | "movq %%mm2, %%mm3 \n\t" |
||
74 | "movq %%mm4, %%mm5 \n\t" |
||
75 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
76 | "punpcklbw %%mm7, %%mm4 \n\t" |
||
77 | "punpckhbw %%mm7, %%mm3 \n\t" |
||
78 | "punpckhbw %%mm7, %%mm5 \n\t" |
||
79 | "paddusw %%mm2, %%mm4 \n\t" |
||
80 | "paddusw %%mm3, %%mm5 \n\t" |
||
81 | "paddusw %%mm6, %%mm0 \n\t" |
||
82 | "paddusw %%mm6, %%mm1 \n\t" |
||
83 | "paddusw %%mm4, %%mm0 \n\t" |
||
84 | "paddusw %%mm5, %%mm1 \n\t" |
||
85 | "psrlw $2, %%mm0 \n\t" |
||
86 | "psrlw $2, %%mm1 \n\t" |
||
87 | "packuswb %%mm1, %%mm0 \n\t" |
||
88 | "movq %%mm0, (%2, %%"REG_a") \n\t" |
||
89 | "add %3, %%"REG_a" \n\t" |
||
90 | |||
// Two rows were written; repeat until h reaches exactly 0.
91 | "subl $2, %0 \n\t" |
||
92 | "jnz 1b \n\t" |
||
// h and pixels are read-write operands, block and line_size are inputs.
// Only REG_a and memory are declared as clobbers; the MMX registers used
// above are not listed.
93 | :"+g"(h), "+S"(pixels) |
||
94 | :"D"(block), "r"((x86_reg)line_size) |
||
95 | :REG_a, "memory"); |
||
96 | } |
||
97 | |||
98 | // avg_pixels |
||
99 | // this routine is 'slightly' suboptimal but mostly unused |
||
/**
 * 8-pixel-wide half-pel interpolation in both x and y, merged with the bytes
 * already stored in block.
 *
 * Assuming MOVQ_ZERO clears mm7 (the macro is defined elsewhere), the
 * interpolated byte is (a + b + c + d + rnd) >> 2, where a and b are the
 * source bytes at columns x and x + 1 of one row, and c and d are the bytes
 * at the same columns of the row line_size bytes further on. rnd is whatever
 * SET_RND loads into mm6. That value and the current destination byte are
 * then handed to PAVGB_MMX, and the macro's output register is stored back to
 * block. PAVGB_MMX is defined elsewhere; presumably it is a per-byte average,
 * but the exact rounding should be confirmed there.
 *
 * @param block     destination; 8 bytes per row are read and then
 *                  overwritten, rows are line_size bytes apart
 * @param pixels    source; every row is read with two 8-byte loads at
 *                  offsets 0 and 1 (9 bytes), and h + 1 rows are read
 * @param line_size byte distance between consecutive rows, applied to both
 *                  the source and the destination
 * @param h         number of rows to produce; the loop emits two rows per
 *                  pass and exits only when h reaches exactly 0, so h has to
 *                  be a positive even number
 */
100 | STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, |
||
101 | ptrdiff_t line_size, int h) |
||
102 | { |
||
// mm7 is the source operand of every punpck*bw below. MOVQ_ZERO is defined
// elsewhere; presumably it zeroes mm7 so the unpacks widen bytes to words.
103 | MOVQ_ZERO(mm7); |
||
104 | SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
||
// Operand map: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a holds the running row offset shared by source and destination.
105 | __asm__ volatile( |
||
// Prologue: load the first source row at offsets 0 and 1, widen both, and
// add them. mm4 holds the pair sums of the low 4 pixels, mm5 of the high 4.
106 | "movq (%1), %%mm0 \n\t" |
||
107 | "movq 1(%1), %%mm4 \n\t" |
||
108 | "movq %%mm0, %%mm1 \n\t" |
||
109 | "movq %%mm4, %%mm5 \n\t" |
||
110 | "punpcklbw %%mm7, %%mm0 \n\t" |
||
111 | "punpcklbw %%mm7, %%mm4 \n\t" |
||
112 | "punpckhbw %%mm7, %%mm1 \n\t" |
||
113 | "punpckhbw %%mm7, %%mm5 \n\t" |
||
114 | "paddusw %%mm0, %%mm4 \n\t" |
||
115 | "paddusw %%mm1, %%mm5 \n\t" |
||
// Row offset starts at 0; pixels is advanced by one row so that
// (%1, REG_a) addresses the row after the one already summed, while
// (%2, REG_a) addresses the output row being produced.
116 | "xor %%"REG_a", %%"REG_a" \n\t" |
||
117 | "add %3, %1 \n\t" |
||
118 | ".p2align 3 \n\t" |
||
119 | "1: \n\t" |
||
// First row of the pass: pair sums of the next source row go to mm0/mm1.
120 | "movq (%1, %%"REG_a"), %%mm0 \n\t" |
||
121 | "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
||
122 | "movq %%mm0, %%mm1 \n\t" |
||
123 | "movq %%mm2, %%mm3 \n\t" |
||
124 | "punpcklbw %%mm7, %%mm0 \n\t" |
||
125 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
126 | "punpckhbw %%mm7, %%mm1 \n\t" |
||
127 | "punpckhbw %%mm7, %%mm3 \n\t" |
||
128 | "paddusw %%mm2, %%mm0 \n\t" |
||
129 | "paddusw %%mm3, %%mm1 \n\t" |
||
// Carried sums (mm4/mm5) + rounding constant + new sums, then >> 2.
130 | "paddusw %%mm6, %%mm4 \n\t" |
||
131 | "paddusw %%mm6, %%mm5 \n\t" |
||
132 | "paddusw %%mm0, %%mm4 \n\t" |
||
133 | "paddusw %%mm1, %%mm5 \n\t" |
||
134 | "psrlw $2, %%mm4 \n\t" |
||
135 | "psrlw $2, %%mm5 \n\t" |
||
// mm3 = the 8 bytes currently in the destination row; mm4 = packed result.
136 | "movq (%2, %%"REG_a"), %%mm3 \n\t" |
||
137 | "packuswb %%mm5, %%mm4 \n\t" |
||
// mm2 = all ones, then each byte added to itself with wraparound, which
// leaves 0xFE in every byte. It is passed as the last PAVGB_MMX argument.
138 | "pcmpeqd %%mm2, %%mm2 \n\t" |
||
139 | "paddb %%mm2, %%mm2 \n\t" |
||
// PAVGB_MMX is defined elsewhere; presumably it averages mm3 and mm4 per
// byte into mm5 (confirm). mm5 is what gets stored to the destination.
140 | PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) |
||
141 | "movq %%mm5, (%2, %%"REG_a") \n\t" |
||
142 | "add %3, %%"REG_a" \n\t" |
||
143 | |||
// Second row of the pass: the same steps with the register roles swapped.
// mm0/mm1 (sums of the row just read) now act as the upper row, and
// mm4/mm5 receive the new row's sums and are carried into the next pass.
144 | "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
||
145 | "movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
||
146 | "movq %%mm2, %%mm3 \n\t" |
||
147 | "movq %%mm4, %%mm5 \n\t" |
||
148 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
149 | "punpcklbw %%mm7, %%mm4 \n\t" |
||
150 | "punpckhbw %%mm7, %%mm3 \n\t" |
||
151 | "punpckhbw %%mm7, %%mm5 \n\t" |
||
152 | "paddusw %%mm2, %%mm4 \n\t" |
||
153 | "paddusw %%mm3, %%mm5 \n\t" |
||
154 | "paddusw %%mm6, %%mm0 \n\t" |
||
155 | "paddusw %%mm6, %%mm1 \n\t" |
||
156 | "paddusw %%mm4, %%mm0 \n\t" |
||
157 | "paddusw %%mm5, %%mm1 \n\t" |
||
158 | "psrlw $2, %%mm0 \n\t" |
||
159 | "psrlw $2, %%mm1 \n\t" |
||
// Same merge with the destination as above; here the packed result is in
// mm0 and the PAVGB_MMX output goes to mm1. mm4/mm5 stay untouched.
160 | "movq (%2, %%"REG_a"), %%mm3 \n\t" |
||
161 | "packuswb %%mm1, %%mm0 \n\t" |
||
162 | "pcmpeqd %%mm2, %%mm2 \n\t" |
||
163 | "paddb %%mm2, %%mm2 \n\t" |
||
164 | PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) |
||
165 | "movq %%mm1, (%2, %%"REG_a") \n\t" |
||
166 | "add %3, %%"REG_a" \n\t" |
||
167 | |||
// Two rows were written; repeat until h reaches exactly 0.
168 | "subl $2, %0 \n\t" |
||
169 | "jnz 1b \n\t" |
||
// h and pixels are read-write operands, block and line_size are inputs.
// Only REG_a and memory are declared as clobbers; the MMX registers used
// above are not listed.
170 | :"+g"(h), "+S"(pixels) |
||
171 | :"D"(block), "r"((x86_reg)line_size) |
||
172 | :REG_a, "memory"); |
||
// NOTE(review): the "->->->->" after the closing brace below is not C. It
// looks like residue from the web listing this text was taken from; confirm
// against the upstream file.
173 | }->->->->