Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * Copyright (c) 2003 Michael Niedermayer |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or modify |
||
7 | * it under the terms of the GNU General Public License as published by |
||
8 | * the Free Software Foundation; either version 2 of the License, or |
||
9 | * (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | * GNU General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU General Public License along |
||
17 | * with FFmpeg; if not, write to the Free Software Foundation, Inc., |
||
18 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||
19 | */ |
||
20 | |||
21 | |||
22 | #include "libavutil/attributes.h" |
||
23 | #include "libavutil/cpu.h" |
||
24 | #include "libavutil/mem.h" |
||
25 | #include "libavutil/x86/asm.h" |
||
26 | #include "libavfilter/vf_spp.h" |
||
27 | |||
28 | #if HAVE_MMX_INLINE |
||
/**
 * Hard-threshold requantization of one block of 64 coefficients using MMX.
 *
 * Every coefficient whose magnitude is at most threshold1
 * (= qp * 16 - 1 with the current bias of 0) is replaced by 0; every other
 * coefficient x becomes (x + 4) >> 3. The 64 results are stored to dst in a
 * permuted order fixed by the 4x4 word shuffles in REQUANT_CORE (for
 * example dst[1] receives the value derived from src[4]). dst[0] is then
 * overwritten with the rounded DC value, which is never thresholded.
 *
 * @param dst          output coefficients (64 entries, all written)
 * @param src          input coefficients (64 entries, read only)
 * @param qp           quantizer used to derive the threshold
 * @param permutation  unused by this implementation; the output order is
 *                     hard-coded in the asm
 *
 * The function leaves the MMX state active; it does not execute emms.
 */
static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp * ((1<<4) - bias) - 1;

/*
 * Process 16 coefficients: load four quadwords, threshold and scale them,
 * shuffle the 4x4 words and store four quadwords.
 * Expects mm4 = threshold1 + 1, mm5 = threshold1 + 5, mm6 = threshold1 - 4,
 * each broadcast to all four 16-bit lanes.
 */
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
    "movq " #src0 ", %%mm0 \n"                                        \
    "movq " #src1 ", %%mm1 \n"                                        \
    "movq " #src2 ", %%mm2 \n"                                        \
    "movq " #src3 ", %%mm3 \n"                                        \
    /* x - (threshold1 + 1), wrapping */                              \
    "psubw %%mm4, %%mm0 \n"                                           \
    "psubw %%mm4, %%mm1 \n"                                           \
    "psubw %%mm4, %%mm2 \n"                                           \
    "psubw %%mm4, %%mm3 \n"                                           \
    /* unsigned saturating add: small values clamp to 0xFFFF */       \
    "paddusw %%mm5, %%mm0 \n"                                         \
    "paddusw %%mm5, %%mm1 \n"                                         \
    "paddusw %%mm5, %%mm2 \n"                                         \
    "paddusw %%mm5, %%mm3 \n"                                         \
    "paddw %%mm6, %%mm0 \n"                                           \
    "paddw %%mm6, %%mm1 \n"                                           \
    "paddw %%mm6, %%mm2 \n"                                           \
    "paddw %%mm6, %%mm3 \n"                                           \
    /* unsigned saturating subtract: values inside the dead zone -> 0 */ \
    "psubusw %%mm6, %%mm0 \n"                                         \
    "psubusw %%mm6, %%mm1 \n"                                         \
    "psubusw %%mm6, %%mm2 \n"                                         \
    "psubusw %%mm6, %%mm3 \n"                                         \
    "psraw $3, %%mm0 \n"                                              \
    "psraw $3, %%mm1 \n"                                              \
    "psraw $3, %%mm2 \n"                                              \
    "psraw $3, %%mm3 \n"                                              \
                                                                      \
    /* word shuffle that produces the permuted output order */        \
    "movq %%mm0, %%mm7 \n"                                            \
    "punpcklwd %%mm2, %%mm0 \n" /*A*/                                 \
    "punpckhwd %%mm2, %%mm7 \n" /*C*/                                 \
    "movq %%mm1, %%mm2 \n"                                            \
    "punpcklwd %%mm3, %%mm1 \n" /*B*/                                 \
    "punpckhwd %%mm3, %%mm2 \n" /*D*/                                 \
    "movq %%mm0, %%mm3 \n"                                            \
    "punpcklwd %%mm1, %%mm0 \n" /*A*/                                 \
    "punpckhwd %%mm7, %%mm3 \n" /*C*/                                 \
    "punpcklwd %%mm2, %%mm7 \n" /*B*/                                 \
    "punpckhwd %%mm2, %%mm1 \n" /*D*/                                 \
                                                                      \
    "movq %%mm0, " #dst0 " \n"                                        \
    "movq %%mm7, " #dst1 " \n"                                        \
    "movq %%mm3, " #dst2 " \n"                                        \
    "movq %%mm1, " #dst3 " \n"

    __asm__ volatile(
        /* load the three constants and broadcast each to four words */
        "movd %2, %%mm4 \n"
        "movd %3, %%mm5 \n"
        "movd %4, %%mm6 \n"
        "packssdw %%mm4, %%mm4 \n"
        "packssdw %%mm5, %%mm5 \n"
        "packssdw %%mm6, %%mm6 \n"
        "packssdw %%mm4, %%mm4 \n"
        "packssdw %%mm5, %%mm5 \n"
        "packssdw %%mm6, %%mm6 \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : /* no register outputs; dst is written through memory */
        /* "rm" rather than "g": movd cannot take an immediate operand */
        : "r" (src), "r" (dst),
          "rm" (threshold1 + 1), "rm" (threshold1 + 5), "rm" (threshold1 - 4) //FIXME maybe more accurate than needed?
        /* the asm loads from src and stores to dst behind the compiler's
         * back; without this clobber the dst[0] store below may be
         * reordered before the asm and then overwritten by it */
        : "memory"
    );
    /* the DC coefficient is only rounded and scaled, never thresholded */
    dst[0] = (src[0] + 4) >> 3;
}
||
98 | |||
/**
 * Soft-threshold requantization of one block of 64 coefficients using MMX.
 *
 * For each coefficient the sign mask is XORed in, threshold1
 * (= qp * 16 - 1 with the current bias of 0) is subtracted with unsigned
 * saturation, the sign mask is XORed in again, and the result is rounded
 * with a saturating +4 and shifted right by 3. Coefficients whose
 * magnitude does not exceed the threshold therefore end up as 0, larger
 * ones are shrunk towards zero before scaling. The 64 results are stored
 * to dst in a permuted order fixed by the 4x4 word shuffles in
 * REQUANT_CORE (for example dst[1] receives the value derived from
 * src[4]). dst[0] is then overwritten with the rounded DC value, which is
 * never thresholded.
 *
 * @param dst          output coefficients (64 entries, all written)
 * @param src          input coefficients (64 entries, read only)
 * @param qp           quantizer used to derive the threshold
 * @param permutation  unused by this implementation; the output order is
 *                     hard-coded in the asm
 *
 * The function leaves the MMX state active; it does not execute emms.
 */
static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp*((1<<4) - bias) - 1;

/*
 * Process 16 coefficients: load four quadwords, shrink and scale them,
 * shuffle the 4x4 words and store four quadwords.
 * Expects mm4 = threshold1 and mm5 = 4, each broadcast to all four
 * 16-bit lanes. mm6/mm7 are used as per-lane sign masks.
 */
#undef REQUANT_CORE
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
    "movq " #src0 ", %%mm0 \n"                                        \
    "movq " #src1 ", %%mm1 \n"                                        \
    /* mm6/mm7 = 0xFFFF in lanes holding a negative value */          \
    "pxor %%mm6, %%mm6 \n"                                            \
    "pxor %%mm7, %%mm7 \n"                                            \
    "pcmpgtw %%mm0, %%mm6 \n"                                         \
    "pcmpgtw %%mm1, %%mm7 \n"                                         \
    /* flip negatives, subtract the threshold saturating at 0, flip back */ \
    "pxor %%mm6, %%mm0 \n"                                            \
    "pxor %%mm7, %%mm1 \n"                                            \
    "psubusw %%mm4, %%mm0 \n"                                         \
    "psubusw %%mm4, %%mm1 \n"                                         \
    "pxor %%mm6, %%mm0 \n"                                            \
    "pxor %%mm7, %%mm1 \n"                                            \
    /* same again for the second pair of quadwords */                 \
    "movq " #src2 ", %%mm2 \n"                                        \
    "movq " #src3 ", %%mm3 \n"                                        \
    "pxor %%mm6, %%mm6 \n"                                            \
    "pxor %%mm7, %%mm7 \n"                                            \
    "pcmpgtw %%mm2, %%mm6 \n"                                         \
    "pcmpgtw %%mm3, %%mm7 \n"                                         \
    "pxor %%mm6, %%mm2 \n"                                            \
    "pxor %%mm7, %%mm3 \n"                                            \
    "psubusw %%mm4, %%mm2 \n"                                         \
    "psubusw %%mm4, %%mm3 \n"                                         \
    "pxor %%mm6, %%mm2 \n"                                            \
    "pxor %%mm7, %%mm3 \n"                                            \
                                                                      \
    /* round (+4, signed saturating) and scale (>> 3) */              \
    "paddsw %%mm5, %%mm0 \n"                                          \
    "paddsw %%mm5, %%mm1 \n"                                          \
    "paddsw %%mm5, %%mm2 \n"                                          \
    "paddsw %%mm5, %%mm3 \n"                                          \
    "psraw $3, %%mm0 \n"                                              \
    "psraw $3, %%mm1 \n"                                              \
    "psraw $3, %%mm2 \n"                                              \
    "psraw $3, %%mm3 \n"                                              \
                                                                      \
    /* word shuffle that produces the permuted output order */        \
    "movq %%mm0, %%mm7 \n"                                            \
    "punpcklwd %%mm2, %%mm0 \n" /*A*/                                 \
    "punpckhwd %%mm2, %%mm7 \n" /*C*/                                 \
    "movq %%mm1, %%mm2 \n"                                            \
    "punpcklwd %%mm3, %%mm1 \n" /*B*/                                 \
    "punpckhwd %%mm3, %%mm2 \n" /*D*/                                 \
    "movq %%mm0, %%mm3 \n"                                            \
    "punpcklwd %%mm1, %%mm0 \n" /*A*/                                 \
    "punpckhwd %%mm7, %%mm3 \n" /*C*/                                 \
    "punpcklwd %%mm2, %%mm7 \n" /*B*/                                 \
    "punpckhwd %%mm2, %%mm1 \n" /*D*/                                 \
                                                                      \
    "movq %%mm0, " #dst0 " \n"                                        \
    "movq %%mm7, " #dst1 " \n"                                        \
    "movq %%mm3, " #dst2 " \n"                                        \
    "movq %%mm1, " #dst3 " \n"

    __asm__ volatile(
        /* load the two constants and broadcast each to four words */
        "movd %2, %%mm4 \n"
        "movd %3, %%mm5 \n"
        "packssdw %%mm4, %%mm4 \n"
        "packssdw %%mm5, %%mm5 \n"
        "packssdw %%mm4, %%mm4 \n"
        "packssdw %%mm5, %%mm5 \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : /* no register outputs; dst is written through memory */
        /* "rm" rather than "g": movd cannot take an immediate operand */
        : "r" (src), "r" (dst), "rm" (threshold1), "rm" (4) //FIXME maybe more accurate than needed?
        /* the asm loads from src and stores to dst behind the compiler's
         * back; without this clobber the dst[0] store below may be
         * reordered before the asm and then overwritten by it */
        : "memory"
    );

    /* the DC coefficient is only rounded and scaled, never thresholded */
    dst[0] = (src[0] + 4) >> 3;
}
||
176 | |||
177 | static void store_slice_mmx(uint8_t *dst, const int16_t *src, |
||
178 | int dst_stride, int src_stride, |
||
179 | int width, int height, int log2_scale, |
||
180 | const uint8_t dither[8][8]) |
||
181 | { |
||
182 | int y; |
||
183 | |||
184 | for (y = 0; y < height; y++) { |
||
185 | uint8_t *dst1 = dst; |
||
186 | const int16_t *src1 = src; |
||
187 | __asm__ volatile( |
||
188 | "movq (%3), %%mm3 \n" |
||
189 | "movq (%3), %%mm4 \n" |
||
190 | "movd %4, %%mm2 \n" |
||
191 | "pxor %%mm0, %%mm0 \n" |
||
192 | "punpcklbw %%mm0, %%mm3 \n" |
||
193 | "punpckhbw %%mm0, %%mm4 \n" |
||
194 | "psraw %%mm2, %%mm3 \n" |
||
195 | "psraw %%mm2, %%mm4 \n" |
||
196 | "movd %5, %%mm2 \n" |
||
197 | "1: \n" |
||
198 | "movq (%0), %%mm0 \n" |
||
199 | "movq 8(%0), %%mm1 \n" |
||
200 | "paddw %%mm3, %%mm0 \n" |
||
201 | "paddw %%mm4, %%mm1 \n" |
||
202 | "psraw %%mm2, %%mm0 \n" |
||
203 | "psraw %%mm2, %%mm1 \n" |
||
204 | "packuswb %%mm1, %%mm0 \n" |
||
205 | "movq %%mm0, (%1) \n" |
||
206 | "add $16, %0 \n" |
||
207 | "add $8, %1 \n" |
||
208 | "cmp %2, %1 \n" |
||
209 | " jb 1b \n" |
||
210 | : "+r" (src1), "+r"(dst1) |
||
211 | : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) |
||
212 | ); |
||
213 | src += src_stride; |
||
214 | dst += dst_stride; |
||
215 | } |
||
216 | } |
||
217 | |||
218 | #endif /* HAVE_MMX_INLINE */ |
||
219 | |||
220 | av_cold void ff_spp_init_x86(SPPContext *s) |
||
221 | { |
||
222 | #if HAVE_MMX_INLINE |
||
223 | int cpu_flags = av_get_cpu_flags(); |
||
224 | |||
225 | if (cpu_flags & AV_CPU_FLAG_MMX) { |
||
226 | s->store_slice = store_slice_mmx; |
||
227 | switch (s->mode) { |
||
228 | case 0: s->requantize = hardthresh_mmx; break; |
||
229 | case 1: s->requantize = softthresh_mmx; break; |
||
230 | } |
||
231 | } |
||
232 | #endif |
||
233 | }>4)><4)>4)><4)> |