Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Mesa 3-D graphics library |
||
3 | * |
||
4 | * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. |
||
5 | * |
||
6 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
7 | * copy of this software and associated documentation files (the "Software"), |
||
8 | * to deal in the Software without restriction, including without limitation |
||
9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
10 | * and/or sell copies of the Software, and to permit persons to whom the |
||
11 | * Software is furnished to do so, subject to the following conditions: |
||
12 | * |
||
13 | * The above copyright notice and this permission notice shall be included |
||
14 | * in all copies or substantial portions of the Software. |
||
15 | * |
||
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||
17 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
||
20 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
||
21 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
||
22 | * OTHER DEALINGS IN THE SOFTWARE. |
||
23 | */ |
||
24 | |||
25 | #ifdef USE_X86_64_ASM |
||
26 | |||
27 | #include "matypes.h" |
||
28 | |||
29 | .text |
||
30 | |||
31 | .align 16 |
||
32 | .globl _mesa_x86_64_cpuid |
||
33 | .hidden _mesa_x86_64_cpuid |
||
34 | _mesa_x86_64_cpuid: /* rdi -> four 32-bit slots; in: slot 0 = eax, slot 2 = ecx; out: eax, ebx, ecx, edx */ |
||
35 | pushq %rbx /* cpuid overwrites ebx, so keep the caller's rbx */ |
||
36 | movl (%rdi), %eax /* eax input for cpuid, taken from slot 0 */ |
||
37 | movl 8(%rdi), %ecx /* ecx input for cpuid, taken from slot 2 */ |
||
38 | |||
39 | cpuid |
||
40 | |||
41 | movl %ebx, 4(%rdi) /* slot 1 = ebx */ |
||
42 | movl %eax, (%rdi) /* slot 0 = eax */ |
||
43 | movl %ecx, 8(%rdi) /* slot 2 = ecx */ |
||
44 | movl %edx, 12(%rdi) /* slot 3 = edx */ |
||
45 | popq %rbx /* restore the caller's rbx */ |
||
46 | ret |
||
47 | |||
48 | .align 16 |
||
49 | .globl _mesa_x86_64_transform_points4_general |
||
50 | .hidden _mesa_x86_64_transform_points4_general |
||
51 | _mesa_x86_64_transform_points4_general: |
||
52 | /* |
||
53 | * rdi = dest (vertex data is stored with movaps, so it must be 16-byte aligned) |
||
54 | * rsi = matrix (loaded with movaps, so it must be 16-byte aligned) |
||
55 | * rdx = source (vertex data is loaded with movups, any alignment) |
||
56 | */ |
||
57 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
58 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
59 | |||
60 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
61 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
62 | .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (three 0x66 prefixes on a nop) */ |
||
63 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
64 | |||
65 | testl %ecx, %ecx /* verify non-zero count */ |
||
66 | prefetchnta 64(%rsi) /* leaves the flags alone, so jz below still tests count */ |
||
67 | jz p4_general_done |
||
68 | |||
69 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
70 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
71 | |||
72 | prefetch 16(%rdx) |
||
73 | |||
74 | movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ |
||
75 | movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ |
||
76 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
77 | movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */ |
||
78 | movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */ |
||
79 | |||
80 | p4_general_loop: |
||
81 | |||
82 | movups (%rdx), %xmm8 /* ow | oz | oy | ox */ |
||
83 | prefetchw 16(%rdi) |
||
84 | |||
85 | pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ |
||
86 | addq %rax, %rdx /* src += stride */ |
||
87 | pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */ |
||
88 | mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ |
||
89 | pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */ |
||
90 | mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ |
||
91 | pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */ |
||
92 | mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ |
||
93 | addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ |
||
94 | mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ |
||
95 | addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ |
||
96 | prefetch 16(%rdx) |
||
97 | addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ |
||
98 | |||
99 | movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ |
||
100 | addq $16, %rdi /* dest advances by a fixed 16 bytes, independent of the source stride */ |
||
101 | |||
102 | decl %ecx /* one vertex done */ |
||
103 | jnz p4_general_loop |
||
104 | |||
105 | p4_general_done: |
||
106 | .byte 0xf3 /* rep prefix: together with the ret below this encodes "rep ret" */ |
||
107 | ret |
||
108 | |||
109 | .section .rodata |
||
110 | |||
111 | .align 16 |
||
112 | p4_constants: /* two 16-byte vectors that _mesa_x86_64_transform_points4_3d loads with movaps */ |
||
113 | .byte 0xff, 0xff, 0xff, 0xff /* vector 0: AND mask, all ones in the three low dwords ... */ |
||
||
114 | .byte 0xff, 0xff, 0xff, 0xff |
||
115 | .byte 0xff, 0xff, 0xff, 0xff |
||
116 | .byte 0x00, 0x00, 0x00, 0x00 /* ... and zero in the highest dword */ |
||
117 | |||
118 | .byte 0x00, 0x00, 0x00, 0x00 /* vector 1 (offset 16): zero in the three low dwords ... */ |
||
||
119 | .byte 0x00, 0x00, 0x00, 0x00 |
||
120 | .byte 0x00, 0x00, 0x00, 0x00 |
||
121 | .float 1.0 /* ... and 1.0 in the highest dword */ |
||
122 | |||
123 | .text |
||
124 | .align 16 |
||
125 | .globl _mesa_x86_64_transform_points4_3d |
||
126 | .hidden _mesa_x86_64_transform_points4_3d |
||
127 | /* |
||
128 | * this is slower than _mesa_x86_64_transform_points4_general |
||
129 | * because it first forces matrix elements m3, m7, m11, m15 to 0, 0, 0, 1 |
||
130 | */ |
||
131 | _mesa_x86_64_transform_points4_3d: /* rdi = dest, rsi = matrix, rdx = source */ |
||
132 | |||
133 | leaq p4_constants(%rip), %rax /* rax = constants; rax is reused for the stride below */ |
||
134 | |||
135 | prefetchnta 64(%rsi) |
||
136 | |||
137 | movaps (%rax), %xmm9 /* AND mask: 0 | ~0 | ~0 | ~0 */ |
||
138 | movaps 16(%rax), %xmm10 /* 1.0 | 0.0 | 0.0 | 0.0 */ |
||
139 | |||
140 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
141 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
142 | |||
143 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
144 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
145 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
146 | |||
147 | testl %ecx, %ecx /* verify non-zero count */ |
||
148 | jz p4_3d_done |
||
149 | |||
150 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
151 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
152 | |||
153 | prefetch 16(%rdx) |
||
154 | |||
155 | movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ |
||
156 | movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ |
||
157 | andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */ |
||
158 | movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */ |
||
159 | andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */ |
||
160 | movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */ |
||
161 | andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */ |
||
162 | andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */ |
||
163 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
164 | orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */ |
||
165 | |||
166 | p4_3d_loop: |
||
167 | |||
168 | movups (%rdx), %xmm8 /* ow | oz | oy | ox */ |
||
169 | prefetchw 16(%rdi) |
||
170 | |||
171 | pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ |
||
172 | addq %rax, %rdx /* src += stride */ |
||
173 | pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */ |
||
174 | mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ |
||
175 | pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */ |
||
176 | mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ |
||
177 | pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */ |
||
178 | mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ |
||
179 | addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ |
||
180 | mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ |
||
181 | addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ |
||
182 | prefetch 16(%rdx) |
||
183 | addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ |
||
184 | |||
185 | movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ |
||
186 | addq $16, %rdi /* dest advances by a fixed 16 bytes, independent of the source stride */ |
||
187 | |||
188 | dec %ecx /* one vertex done */ |
||
189 | jnz p4_3d_loop |
||
190 | |||
191 | p4_3d_done: |
||
192 | .byte 0xf3 /* rep prefix: together with the ret below this encodes "rep ret" */ |
||
193 | ret |
||
194 | |||
195 | |||
196 | .align 16 |
||
197 | .globl _mesa_x86_64_transform_points4_identity |
||
198 | .hidden _mesa_x86_64_transform_points4_identity |
||
199 | _mesa_x86_64_transform_points4_identity: /* rdi = dest, rdx = source; rsi is never read as a matrix and is reused as the copy source */ |
||
200 | |||
201 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
202 | movzbl V4F_STRIDE(%rdx), %eax /* stride - NOTE(review): loaded but never used below; the copy assumes packed 16-byte source vertices, confirm callers */ |
||
203 | |||
204 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
205 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
206 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
207 | |||
208 | test %ecx, %ecx /* nothing to copy when count is zero */ |
||
209 | jz p4_identity_done |
||
210 | |||
211 | movq V4F_START(%rdx), %rsi /* ptr to first src vertex */ |
||
212 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
213 | prefetch 64(%rsi) |
||
214 | prefetchw 64(%rdi) |
||
215 | |||
216 | add %ecx, %ecx /* rcx = 2 * count: two 8-byte words per 16-byte vertex */ |
||
217 | |||
218 | rep movsq /* copy rcx quadwords from (%rsi) to (%rdi); ascending copy presumes the direction flag is clear */ |
||
219 | |||
220 | p4_identity_done: |
||
221 | .byte 0xf3 /* rep prefix: together with the ret below this encodes "rep ret" */ |
||
222 | ret |
||
223 | |||
224 | |||
225 | .align 16 |
||
226 | .globl _mesa_3dnow_transform_points4_3d_no_rot |
||
227 | .hidden _mesa_3dnow_transform_points4_3d_no_rot |
||
228 | _mesa_3dnow_transform_points4_3d_no_rot: /* rdi = dest, rsi = matrix, rdx = source */ |
||
229 | |||
230 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
231 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
232 | |||
233 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
234 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
235 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
236 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
237 | |||
238 | test %ecx, %ecx /* skip the loop when count is zero */ |
||
239 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
240 | jz p4_3d_no_rot_done |
||
241 | |||
242 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
243 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
244 | |||
245 | prefetch (%rdx) |
||
246 | |||
247 | movd (%rsi), %mm0 /* | m00 */ |
||
248 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
249 | punpckldq 20(%rsi), %mm0 /* m11 | m00 */ |
||
250 | |||
251 | movd 40(%rsi), %mm2 /* | m22 */ |
||
252 | movq 48(%rsi), %mm1 /* m31 | m30 */ |
||
253 | |||
254 | punpckldq 56(%rsi), %mm2 /* m32 | m22 */ |
||
255 | |||
256 | p4_3d_no_rot_loop: /* r0 = x0*m00+x3*m30, r1 = x1*m11+x3*m31, r2 = x2*m22+x3*m32, r3 = x3 */ |
||
257 | |||
258 | prefetchw 32(%rdi) |
||
259 | |||
260 | movq (%rdx), %mm4 /* x1 | x0 */ |
||
261 | movq 8(%rdx), %mm5 /* x3 | x2 */ |
||
262 | movd 12(%rdx), %mm7 /* | x3 */ |
||
263 | |||
264 | movq %mm5, %mm6 /* x3 | x2 */ |
||
265 | pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ |
||
266 | |||
267 | punpckhdq %mm6, %mm6 /* x3 | x3 */ |
||
268 | pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */ |
||
269 | |||
270 | pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ |
||
271 | pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */ |
||
272 | |||
273 | pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ |
||
274 | |||
275 | addq %rax, %rdx /* src += stride */ |
||
276 | movq %mm4, (%rdi) /* write r0, r1 */ |
||
277 | movq %mm5, 8(%rdi) /* write r2, r3 */ |
||
278 | |||
279 | addq $16, %rdi /* dest advances by a fixed 16 bytes */ |
||
280 | |||
281 | decl %ecx /* one vertex done */ |
||
282 | prefetch 32(%rdx) /* leaves the flags from decl intact for the jnz */ |
||
283 | jnz p4_3d_no_rot_loop |
||
284 | |||
285 | p4_3d_no_rot_done: |
||
286 | femms /* exit MMX/3DNow! state before returning */ |
||
287 | ret |
||
288 | |||
289 | |||
290 | .align 16 |
||
291 | .globl _mesa_3dnow_transform_points4_perspective |
||
292 | .hidden _mesa_3dnow_transform_points4_perspective |
||
293 | _mesa_3dnow_transform_points4_perspective: /* rdi = dest, rsi = matrix, rdx = source */ |
||
294 | |||
295 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
296 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
297 | |||
298 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
299 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
300 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
301 | |||
302 | test %ecx, %ecx /* skip the loop when count is zero */ |
||
303 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
304 | jz p4_perspective_done |
||
305 | |||
306 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
307 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
308 | |||
309 | movd (%rsi), %mm0 /* | m00 */ |
||
310 | pxor %mm7, %mm7 /* 0 | 0 (used by pfsubr below to negate x2) */ |
||
311 | punpckldq 20(%rsi), %mm0 /* m11 | m00 */ |
||
312 | |||
313 | movq 32(%rsi), %mm2 /* m21 | m20 */ |
||
314 | prefetch (%rdx) |
||
315 | |||
316 | movd 40(%rsi), %mm1 /* | m22 */ |
||
||
317 | |||
318 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
319 | punpckldq 56(%rsi), %mm1 /* m32 | m22 */ |
||
320 | |||
321 | |||
322 | p4_perspective_loop: /* r0 = x0*m00+x2*m20, r1 = x1*m11+x2*m21, r2 = x2*m22+x3*m32, r3 = -x2 */ |
||
323 | |||
324 | prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ |
||
325 | |||
326 | movq (%rdx), %mm4 /* x1 | x0 */ |
||
327 | movq 8(%rdx), %mm5 /* x3 | x2 */ |
||
328 | movd 8(%rdx), %mm3 /* | x2 */ |
||
329 | |||
330 | movq %mm5, %mm6 /* x3 | x2 */ |
||
331 | pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ |
||
332 | |||
333 | punpckldq %mm5, %mm5 /* x2 | x2 */ |
||
||
334 | |||
335 | pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */ |
||
336 | pfsubr %mm7, %mm3 /* | -x2 */ |
||
337 | |||
338 | pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */ |
||
339 | pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */ |
||
340 | |||
341 | pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */ |
||
342 | |||
343 | movq %mm5, (%rdi) /* write r0, r1 */ |
||
344 | addq %rax, %rdx /* src += stride */ |
||
345 | movq %mm6, 8(%rdi) /* write r2, r3 */ |
||
346 | |||
347 | addq $16, %rdi /* dest advances by a fixed 16 bytes */ |
||
348 | |||
349 | decl %ecx /* one vertex done */ |
||
350 | prefetch 32(%rdx) /* hopefully stride is zero */ |
||
351 | jnz p4_perspective_loop |
||
352 | |||
353 | p4_perspective_done: |
||
354 | femms /* exit MMX/3DNow! state before returning */ |
||
355 | ret |
||
356 | |||
357 | .align 16 |
||
358 | .globl _mesa_3dnow_transform_points4_2d_no_rot |
||
359 | .hidden _mesa_3dnow_transform_points4_2d_no_rot |
||
360 | _mesa_3dnow_transform_points4_2d_no_rot: /* rdi = dest, rsi = matrix, rdx = source */ |
||
361 | |||
362 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
363 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
364 | |||
365 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
366 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
367 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
368 | |||
369 | test %ecx, %ecx /* skip the loop when count is zero */ |
||
370 | .byte 0x90 /* manual align += 1 */ |
||
371 | jz p4_2d_no_rot_done |
||
372 | |||
373 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
374 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
375 | |||
376 | movd (%rsi), %mm0 /* | m00 */ |
||
377 | prefetch (%rdx) |
||
378 | punpckldq 20(%rsi), %mm0 /* m11 | m00 */ |
||
379 | |||
380 | movq 48(%rsi), %mm1 /* m31 | m30 */ |
||
381 | |||
382 | p4_2d_no_rot_loop: /* r0 = x0*m00+x3*m30, r1 = x1*m11+x3*m31, r2 = x2, r3 = x3 */ |
||
383 | |||
384 | prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ |
||
385 | |||
386 | movq (%rdx), %mm4 /* x1 | x0 */ |
||
387 | movq 8(%rdx), %mm5 /* x3 | x2 */ |
||
388 | |||
389 | pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ |
||
390 | movq %mm5, %mm6 /* x3 | x2 */ |
||
391 | |||
392 | punpckhdq %mm6, %mm6 /* x3 | x3 */ |
||
||
393 | |||
394 | addq %rax, %rdx /* src += stride */ |
||
395 | pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ |
||
396 | |||
397 | prefetch 32(%rdx) /* hopefully stride is zero */ |
||
398 | pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ |
||
399 | |||
400 | movq %mm6, (%rdi) /* write r0, r1 */ |
||
401 | movq %mm5, 8(%rdi) /* write r2, r3 (x2 and x3 copied through unchanged) */ |
||
402 | |||
403 | addq $16, %rdi /* dest advances by a fixed 16 bytes */ |
||
404 | |||
405 | decl %ecx /* one vertex done */ |
||
406 | jnz p4_2d_no_rot_loop |
||
407 | |||
408 | p4_2d_no_rot_done: |
||
409 | femms /* exit MMX/3DNow! state before returning */ |
||
410 | ret |
||
411 | |||
412 | |||
413 | .align 16 |
||
414 | .globl _mesa_3dnow_transform_points4_2d |
||
415 | .hidden _mesa_3dnow_transform_points4_2d |
||
416 | _mesa_3dnow_transform_points4_2d: /* rdi = dest, rsi = matrix, rdx = source */ |
||
417 | |||
418 | movl V4F_COUNT(%rdx), %ecx /* count */ |
||
419 | movzbl V4F_STRIDE(%rdx), %eax /* stride; only the low byte is read - NOTE(review): confirm stride never exceeds 255 */ |
||
420 | |||
421 | movl %ecx, V4F_COUNT(%rdi) /* set dest count */ |
||
422 | movl $4, V4F_SIZE(%rdi) /* set dest size */ |
||
423 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
424 | orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ |
||
425 | |||
426 | test %ecx, %ecx /* skip the loop when count is zero */ |
||
427 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
428 | jz p4_2d_done |
||
429 | |||
430 | movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ |
||
431 | movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ |
||
432 | |||
433 | movd (%rsi), %mm0 /* | m00 */ |
||
434 | movd 4(%rsi), %mm1 /* | m01 */ |
||
435 | |||
436 | prefetch (%rdx) |
||
||
437 | |||
438 | punpckldq 16(%rsi), %mm0 /* m10 | m00 */ |
||
439 | .byte 0x66, 0x66, 0x90 /* manual align += 3 */ |
||
440 | punpckldq 20(%rsi), %mm1 /* m11 | m01 */ |
||
441 | |||
442 | movq 48(%rsi), %mm2 /* m31 | m30 */ |
||
443 | |||
444 | p4_2d_loop: /* r0 = x0*m00+x1*m10+x3*m30, r1 = x0*m01+x1*m11+x3*m31, r2 = x2, r3 = x3 */ |
||
445 | |||
446 | prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ |
||
447 | |||
448 | movq (%rdx), %mm3 /* x1 | x0 */ |
||
449 | movq 8(%rdx), %mm5 /* x3 | x2 */ |
||
450 | |||
451 | movq %mm3, %mm4 /* x1 | x0 */ |
||
452 | movq %mm5, %mm6 /* x3 | x2 */ |
||
453 | |||
454 | pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */ |
||
455 | punpckhdq %mm6, %mm6 /* x3 | x3 */ |
||
456 | |||
457 | pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */ |
||
||
458 | |||
459 | addq %rax, %rdx /* src += stride */ |
||
460 | pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */ |
||
461 | |||
462 | pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */ |
||
463 | prefetch 32(%rdx) /* hopefully stride is zero */ |
||
464 | |||
465 | pfadd %mm6, %mm3 /* r1 | r0 */ |
||
||
466 | |||
467 | movq %mm3, (%rdi) /* write r0, r1 */ |
||
468 | movq %mm5, 8(%rdi) /* write r2, r3 (x2 and x3 copied through unchanged) */ |
||
469 | |||
470 | addq $16, %rdi /* dest advances by a fixed 16 bytes */ |
||
||
471 | |||
472 | decl %ecx /* one vertex done */ |
||
473 | jnz p4_2d_loop |
||
474 | |||
475 | p4_2d_done: |
||
476 | femms /* exit MMX/3DNow! state before returning */ |
||
477 | ret |
||
478 | |||
479 | #endif |
||
480 | |||
481 | #if defined (__ELF__) && defined (__linux__) |
||
482 | .section .note.GNU-stack,"",%progbits |
||
483 | #endif |