/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick 
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
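
/* As invoked below, LOAD_MASK leaves m1 holding the 32-bit pattern
 * 0xff00ff00 repeated across the register (the byte lanes where the G
 * and A components of a BGRA pixel live) and m2 holding 0x00ff0000
 * repeated (the R byte lane); the copy loops use these two masks to
 * swap the R and B bytes of each pixel.
 */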

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
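
/* A rough C equivalent of DO_ONE_PIXEL, assuming a little-endian x86 host
 * and GCC's __builtin_bswap32 (the C names here are illustrative only):
 *
 *     uint32_t p = *src++;           // register value is 0xAARRGGBB
 *     p = __builtin_bswap32(p);      // ARGB -> BGRA: 0xBBGGRRAA
 *     p = (p >> 8) | (p << 24);      // rotate right by 8: 0xAABBGGRR
 *     *dst++ = p;                    // bytes stored: R, G, B, A
 */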


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
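
	/* The masks loaded by LOAD_MASK drive this shuffle: %mm1 keeps the
	 * G and A bytes of each pixel in place, while %mm2 selects the
	 * byte-2 lane.  %mm3 picks up the R bytes and shifts them down to
	 * byte 0, %mm4 shifts the B bytes up into byte 2, and the ORs merge
	 * everything back together in R, G, B, A order.
	 */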

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx
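
	/* The sequence above computes how many leading pixels have to be
	 * handled one at a time before %ebx reaches a 16-byte boundary:
	 * (-src) & 15 is the number of bytes to the next aligned address,
	 * and the shift right by 2 converts bytes to pixels.  %esi gets
	 * that count (clamped to the total pixel count) and %edx keeps the
	 * number of pixels left for the aligned loop.
	 */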

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
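
/* Worked example of the scaling for SCALE_ADJUST == 5, red channel:
 * after masking, the red word of an MMX register holds r << 11 with
 * r in [0, 31].  PMULLW by the PRESCALE word 0x0001 leaves r << 11,
 * PSRLW by SCALE_ADJUST gives r << 6, and PMULHUW by the SCALE word
 * 0x20E8 yields ((r << 6) * 0x20E8) >> 16, which is approximately
 * r * 255 / 31 (0 maps to 0, 31 maps to 255).  Green (6 bits) and blue
 * go through the same steps with the 0x40C6 and 0x839d scale words.
 * In the routine below, %mm5 holds the channel masks, %mm6 the
 * prescale words, %mm7 the scale words and %mm3 the alpha fill.
 */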

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif