Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5564 serge 1
/*
2
 * Mesa 3-D graphics library
3
 *
4
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included
14
 * in all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
 * OTHER DEALINGS IN THE SOFTWARE.
23
 */
24
 
25
#ifdef USE_X86_64_ASM
26
 
27
#include "matypes.h"
28
 
29
.text
30
 
31
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
/*
 * _mesa_x86_64_cpuid
 *
 *	rdi = pointer to four consecutive 32-bit words
 *
 * On entry word 0 (offset 0) supplies %eax and word 2 (offset 8) supplies
 * %ecx for the CPUID instruction.  On return the four words hold
 * %eax, %ebx, %ecx, %edx at offsets 0, 4, 8 and 12.
 *
 * CPUID overwrites %ebx, so %rbx is saved on the stack and restored
 * before returning.
 */
_mesa_x86_64_cpuid:
	pushq	%rbx			/* keep the caller's rbx */

	movl	0(%rdi), %eax		/* CPUID input: eax */
	movl	8(%rdi), %ecx		/* CPUID input: ecx */

	cpuid

	movl	%eax, 0(%rdi)		/* word 0 <- eax */
	movl	%ebx, 4(%rdi)		/* word 1 <- ebx */
	movl	%ecx, 8(%rdi)		/* word 2 <- ecx */
	movl	%edx, 12(%rdi)		/* word 3 <- edx */

	popq	%rbx			/* give rbx back */
	ret
47
 
48
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
/*
 * _mesa_x86_64_transform_points4_general
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, 16 floats m0..m15 (loaded with movaps)
 *	rdx = source vector descriptor
 *
 * For every source vertex (ox, oy, oz, ow) the four result floats are
 *
 *	D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i),   i = 0..3
 *
 * Source vertices are read with movups and are `stride` bytes apart.
 * Destination vertices are written with movaps, 16 bytes apart.
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.
 *
 * Lane comments list the highest lane first.
 * The .byte sequences are operand-size-prefixed NOPs kept as manual
 * alignment padding; the instruction order is left untouched for that
 * reason.
 */
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte 0x66, 0x66, 0x66, 0x90		/* 4-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	prefetchnta 64(%rsi)			/* does not alter the flags */
	jz	.Lp4_general_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetch 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */

.Lp4_general_loop:

	movups	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* broadcast ox */
	addq	%rax, %rdx			/* step to next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* broadcast oy */
	mulps	%xmm4, %xmm0			/* ox * (m3..m0) */
	pshufd	$0xAA, %xmm8, %xmm2		/* broadcast oz */
	mulps	%xmm5, %xmm1			/* oy * (m7..m4) */
	pshufd	$0xFF, %xmm8, %xmm3		/* broadcast ow */
	mulps	%xmm6, %xmm2			/* oz * (m11..m8) */
	addps	%xmm1, %xmm0			/* ox and oy terms */
	mulps	%xmm7, %xmm3			/* ow * (m15..m12) */
	addps	%xmm2, %xmm0			/* plus oz terms */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* plus ow terms */

	movaps	%xmm0, (%rdi)			/* store D(3) | D(2) | D(1) | D(0) */
	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	jnz	.Lp4_general_loop

.Lp4_general_done:
	.byte 0xf3				/* rep prefix in front of ret */
	ret
108
 
109
.section .rodata

/*
 * p4_constants: two 16-byte constants, read with movaps by
 * _mesa_x86_64_transform_points4_3d.
 *
 *	+0   all-ones in the three low dwords, zero in the top dword
 *	     (AND mask that clears the top float of a register)
 *	+16  zero in the three low dwords, 1.0f in the top dword
 *	     (OR value that sets the top float to 1.0)
 */
.align 16
p4_constants:
.long	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

.long	0x00000000, 0x00000000, 0x00000000
.float	1.0
122
 
123
.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, 16 floats m0..m15 (loaded with movaps)
 *	rdx = source vector descriptor
 *
 * Same arithmetic as _mesa_x86_64_transform_points4_general, except that
 * the matrix copy held in registers is first patched so that
 * m3 = m7 = m11 = 0.0 and m15 = 1.0.  That extra masking makes this
 * routine slower than the general one.  The matrix in memory is not
 * modified.
 *
 * Lane comments list the highest lane first.
 * The .byte sequence is a prefixed NOP kept as manual alignment padding.
 */
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax	/* rax = constant table */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* AND mask: clears the top lane */
	movaps	16(%rax), %xmm10		/* OR value: 1.0 in the top lane */

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	jz	.Lp4_3d_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetch 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4			/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5			/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6			/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7			/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	orps	%xmm10, %xmm7			/* 1.0 | m14 | m13 | m12 */

.Lp4_3d_loop:

	movups	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* broadcast ox */
	addq	%rax, %rdx			/* step to next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* broadcast oy */
	mulps	%xmm4, %xmm0			/* ox * patched (m3..m0) */
	pshufd	$0xAA, %xmm8, %xmm2		/* broadcast oz */
	mulps	%xmm5, %xmm1			/* oy * patched (m7..m4) */
	pshufd	$0xFF, %xmm8, %xmm3		/* broadcast ow */
	mulps	%xmm6, %xmm2			/* oz * patched (m11..m8) */
	addps	%xmm1, %xmm0			/* ox and oy terms */
	mulps	%xmm7, %xmm3			/* ow * patched (m15..m12) */
	addps	%xmm2, %xmm0			/* plus oz terms */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* plus ow terms */

	movaps	%xmm0, (%rdi)			/* store D(3) | D(2) | D(1) | D(0) */
	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	jnz	.Lp4_3d_loop

.Lp4_3d_done:
	.byte 0xf3				/* rep prefix in front of ret */
	ret
194
 
195
 
196
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
/*
 * _mesa_x86_64_transform_points4_identity
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix (never read; the register is reused as the copy source)
 *	rdx = source vector descriptor
 *
 * Copies every 4-float source vertex to the destination unchanged.
 * Destination vertices are packed 16 bytes apart.  Source vertices are
 * `stride` bytes apart, exactly as in the other transform routines of
 * this file:
 *
 *   - stride == 16: the source is packed too, so the whole array is
 *     copied with a single rep movsq (two quadwords per vertex);
 *   - any other stride: vertices are copied one at a time and the source
 *     pointer advances by the stride, so a non-packed source is not
 *     mistaken for a packed one.
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.
 *
 * rep movsq copies upwards only while the direction flag is clear, which
 * the System V AMD64 ABI guarantees at function entry.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl),
 * matching the sibling routines -- confirm strides above 255 cannot occur.
 */
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count (rcx upper half zeroed) */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	jz	.Lp4_identity_done

	movq	V4F_START(%rdx), %rsi		/* rsi = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl	$16, %eax			/* packed source? */
	jne	.Lp4_identity_strided

	addq	%rcx, %rcx			/* quadwords = 2 * count, cannot wrap in 64 bits */
	rep movsq
	ret

.Lp4_identity_strided:
	movups	(%rsi), %xmm0			/* load one vertex, no alignment assumed */
	addq	%rax, %rsi			/* step source by its stride */
	movups	%xmm0, (%rdi)			/* store it, no alignment assumed */
	addq	$16, %rdi			/* dest is always packed */

	decl	%ecx
	jnz	.Lp4_identity_strided

.Lp4_identity_done:
	.byte 0xf3				/* rep prefix in front of ret */
	ret
223
 
224
 
225
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, floats m[0]..m[15] (m[k] lives at byte offset 4*k)
 *	rdx = source vector descriptor
 *
 * 3DNow! implementation.  For every source vertex (x0, x1, x2, x3):
 *
 *	r0 = x0*m[0]  + x3*m[12]
 *	r1 = x1*m[5]  + x3*m[13]
 *	r2 = x2*m[10] + x3*m[14]
 *	r3 = x3
 *
 * Source vertices are `stride` bytes apart, dest vertices 16 bytes apart.
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.  femms runs on every exit path.
 *
 * Register comments read "high dword | low dword".
 * The .byte sequences are prefixed NOPs kept as manual alignment padding;
 * NOP leaves the flags alone, so the one between test and jz is harmless.
 */
_mesa_3dnow_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	jz	.Lp4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetch (%rdx)

	movd	(%rsi), %mm0			/* 0     | m[0]  */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	punpckldq 20(%rsi), %mm0		/* m[5]  | m[0]  */

	movd	40(%rsi), %mm2			/* 0     | m[10] */
	movq	48(%rsi), %mm1			/* m[13] | m[12] */

	punpckldq 56(%rsi), %mm2		/* m[14] | m[10] */

.Lp4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */
	movd	12(%rdx), %mm7			/* 0  | x3 */

	movq	%mm5, %mm6			/* x3 | x2 */
	pfmul	%mm0, %mm4			/* x1*m[5] | x0*m[0] */

	punpckhdq %mm6, %mm6			/* x3 | x3 */
	pfmul	%mm2, %mm5			/* x3*m[14] | x2*m[10] */

	pfmul	%mm1, %mm6			/* x3*m[13] | x3*m[12] */
	pfacc	%mm7, %mm5			/* x3 | x2*m[10]+x3*m[14] */

	pfadd	%mm6, %mm4			/* r1 | r0 */

	addq	%rax, %rdx			/* step to next source vertex */
	movq	%mm4, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	prefetch 32(%rdx)			/* does not alter the flags */
	jnz	.Lp4_3d_no_rot_loop

.Lp4_3d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
288
 
289
 
290
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, floats m[0]..m[15] (m[k] lives at byte offset 4*k)
 *	rdx = source vector descriptor
 *
 * 3DNow! implementation.  For every source vertex (x0, x1, x2, x3):
 *
 *	r0 = x0*m[0]  + x2*m[8]
 *	r1 = x1*m[5]  + x2*m[9]
 *	r2 = x2*m[10] + x3*m[14]
 *	r3 = -x2
 *
 * Source vertices are `stride` bytes apart, dest vertices 16 bytes apart.
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.  femms runs on every exit path.
 *
 * Register comments read "high dword | low dword".
 * The .byte sequences are prefixed NOPs kept as manual alignment padding;
 * NOP leaves the flags alone, so the one between test and jz is harmless.
 */
_mesa_3dnow_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	jz	.Lp4_perspective_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* 0     | m[0]  */
	pxor	%mm7, %mm7			/* 0     | 0     */
	punpckldq 20(%rsi), %mm0		/* m[5]  | m[0]  */

	movq	32(%rsi), %mm2			/* m[9]  | m[8]  */
	prefetch (%rdx)

	movd	40(%rsi), %mm1			/* 0     | m[10] */

	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	punpckldq 56(%rsi), %mm1		/* m[14] | m[10] */


.Lp4_perspective_loop:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */
	movd	8(%rdx), %mm3			/* 0  | x2 */

	movq	%mm5, %mm6			/* x3 | x2 */
	pfmul	%mm0, %mm4			/* x1*m[5] | x0*m[0] */

	punpckldq %mm5, %mm5			/* x2 | x2 */

	pfmul	%mm2, %mm5			/* x2*m[9] | x2*m[8] */
	pfsubr	%mm7, %mm3			/* mm3 = mm7 - mm3: 0 | -x2 */

	pfmul	%mm1, %mm6			/* x3*m[14] | x2*m[10] */
	pfadd	%mm4, %mm5			/* r1 | r0 */

	pfacc	%mm3, %mm6			/* -x2 | x2*m[10]+x3*m[14] */

	movq	%mm5, (%rdi)			/* store r0, r1 */
	addq	%rax, %rdx			/* step to next source vertex */
	movq	%mm6, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	prefetch 32(%rdx)			/* does not alter the flags */
	jnz	.Lp4_perspective_loop

.Lp4_perspective_done:
	femms					/* leave MMX/3DNow! state */
	ret
356
 
357
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, floats m[0]..m[15] (m[k] lives at byte offset 4*k)
 *	rdx = source vector descriptor
 *
 * 3DNow! implementation.  For every source vertex (x0, x1, x2, x3):
 *
 *	r0 = x0*m[0] + x3*m[12]
 *	r1 = x1*m[5] + x3*m[13]
 *	r2 = x2
 *	r3 = x3
 *
 * Source vertices are `stride` bytes apart, dest vertices 16 bytes apart.
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.  femms runs on every exit path.
 *
 * Register comments read "high dword | low dword".
 * The single 0x90 byte is a NOP kept as manual alignment padding; it
 * leaves the flags alone, so placing it between test and jz is harmless.
 */
_mesa_3dnow_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	.byte 0x90				/* 1-byte NOP (padding) */
	jz	.Lp4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* 0     | m[0]  */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0		/* m[5]  | m[0]  */

	movq	48(%rsi), %mm1			/* m[13] | m[12] */

.Lp4_2d_no_rot_loop:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */

	pfmul	%mm0, %mm4			/* x1*m[5] | x0*m[0] */
	movq	%mm5, %mm6			/* x3 | x2 */

	punpckhdq %mm6, %mm6			/* x3 | x3 */

	addq	%rax, %rdx			/* step to next source vertex */
	pfmul	%mm1, %mm6			/* x3*m[13] | x3*m[12] */

	prefetch 32(%rdx)
	pfadd	%mm4, %mm6			/* r1 | r0 */

	movq	%mm6, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2 = x2, r3 = x3 */

	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	jnz	.Lp4_2d_no_rot_loop

.Lp4_2d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
411
 
412
 
413
.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
/*
 * _mesa_3dnow_transform_points4_2d
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, floats m[0]..m[15] (m[k] lives at byte offset 4*k)
 *	rdx = source vector descriptor
 *
 * 3DNow! implementation.  For every source vertex (x0, x1, x2, x3):
 *
 *	r0 = x0*m[0] + x1*m[4] + x3*m[12]
 *	r1 = x0*m[1] + x1*m[5] + x3*m[13]
 *	r2 = x2
 *	r3 = x3
 *
 * Source vertices are `stride` bytes apart, dest vertices 16 bytes apart.
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4
 * flag bits even when the count is zero.  femms runs on every exit path.
 *
 * Register comments read "high dword | low dword".
 * Each .byte sequence is a 3-byte prefixed NOP kept as manual alignment
 * padding; NOP leaves the flags alone, so the one between test and jz is
 * harmless.
 */
_mesa_3dnow_transform_points4_2d:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of source stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for count 0 */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	jz	.Lp4_2d_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* 0     | m[0]  */
	movd	4(%rsi), %mm1			/* 0     | m[1]  */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0		/* m[4]  | m[0]  */
	.byte 0x66, 0x66, 0x90			/* 3-byte NOP (padding) */
	punpckldq 20(%rsi), %mm1		/* m[5]  | m[1]  */

	movq	48(%rsi), %mm2			/* m[13] | m[12] */

.Lp4_2d_loop:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm3			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */

	movq	%mm3, %mm4			/* x1 | x0 */
	movq	%mm5, %mm6			/* x3 | x2 */

	pfmul	%mm1, %mm4			/* x1*m[5] | x0*m[1] */
	punpckhdq %mm6, %mm6			/* x3 | x3 */

	pfmul	%mm0, %mm3			/* x1*m[4] | x0*m[0] */

	addq	%rax, %rdx			/* step to next source vertex */
	pfacc	%mm4, %mm3			/* x0*m[1]+x1*m[5] | x0*m[0]+x1*m[4] */

	pfmul	%mm2, %mm6			/* x3*m[13] | x3*m[12] */
	prefetch 32(%rdx)

	pfadd	%mm6, %mm3			/* r1 | r0 */

	movq	%mm3, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2 = x2, r3 = x3 */

	addq	$16, %rdi			/* next dest vertex */

	decl	%ecx
	jnz	.Lp4_2d_loop

.Lp4_2d_done:
	femms					/* leave MMX/3DNow! state */
	ret
478
 
479
#endif
480
 
481
#if defined (__ELF__) && defined (__linux__)
482
	.section .note.GNU-stack,"",%progbits
483
#endif