Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
;
2
/*
3
 * Written by José Fonseca
4
 */
5
 
6
 
7
#ifdef USE_MMX_ASM
8
#include "assyntax.h"
9
#include "matypes.h"
10
 
11
/* integer multiplication - alpha plus one
12
 *
13
 * makes the following approximation to the division (Sree)
14
 *
15
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
16
 *
17
 * which is the fastest method that satisfies the following OpenGL criteria
18
 *
19
 *   0*0 = 0 and 255*255 = 255
20
 *
21
 * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making
22
 *
23
 *   PCMPEQW    ( MX1, MX1 )
24
 */
25
/* GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 )
 *
 * Per 16-bit lane: MA1 = (MP1 * (MA1 + 1)) >> 8, and likewise MA2 from MP2 on
 * the TWO() lines.  MX1 must hold all ones, so subtracting it adds one.
 * Operands are written (source, destination): MA1/MA2 are overwritten with
 * the result, MP1/MP2 and MX1 are only read.
 */
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
26
    PSUBW      ( MX1, MA1 )			/*   a1 + 1  |   a1 + 1  |   a1 + 1  |   a1 + 1  */	;\
27
    PMULLW     ( MP1, MA1 )			/*               t1 = p1*(a1 + 1)                */	;\
28
													;\
29
TWO(PSUBW      ( MX1, MA2 ))			/*   a2 + 1  |   a2 + 1  |   a2 + 1  |   a2 + 1  */	;\
30
TWO(PMULLW     ( MP2, MA2 ))			/*               t2 = p2*(a2 + 1)                */	;\
31
													;\
32
    PSRLW      ( CONST(8), MA1 )		/*            t1 >> 8 ~= p1*a1/255               */	;\
33
TWO(PSRLW      ( CONST(8), MA2 ))		/*            t2 >> 8 ~= p2*a2/255               */
34
 
35
 
36
/* integer multiplication - geometric series
37
 *
38
 * takes the geometric series approximation to the division
39
 *
40
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
41
 *
42
 * in this case just the first two terms to fit in 16bit arithmetic
43
 *
44
 *   t/255 ~= (t + (t >> 8)) >> 8
45
 *
46
 * note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
47
 * so the special case a = 255 must be accounted for or roundoff must be used
48
 */
49
/* GMB_MULT_GS( MP1, MA1, MP2, MA2 )
 *
 * Per 16-bit lane: t = MP1 * MA1, then MA1 = (t + (t >> 8)) >> 8; the TWO()
 * lines do the same for MP2/MA2.  MA1/MA2 receive the result; MP1/MP2 are
 * clobbered (they are reused to hold a copy of t).
 */
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
50
    PMULLW     ( MP1, MA1 )			/*                  t1 = p1*a1                   */	;\
51
TWO(PMULLW     ( MP2, MA2 ))			/*                  t2 = p2*a2                   */	;\
52
													;\
53
    MOVQ       ( MA1, MP1 )			/*          keep a copy of t1 in MP1             */	;\
54
    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
55
													;\
56
TWO(MOVQ       ( MA2, MP2 ))			/*          keep a copy of t2 in MP2             */	;\
57
TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
58
													;\
59
    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
60
    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
61
													;\
62
TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
63
TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
64
 
65
 
66
/* integer multiplication - geometric series plus rounding
67
 *
68
 * when using a geometric series division instead of truncating the result
69
 * use roundoff in the approximation (Jim Blinn)
70
 *
71
 *   t = rgb*a + 0x80
72
 *
73
 * achieving the exact results
74
 *
75
 * note that M80 is a register with the 0x0080008000800080 constant
76
 */
77
/* GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 )
 *
 * Per 16-bit lane: t = MP1 * MA1 + 0x80, then MA1 = (t + (t >> 8)) >> 8; the
 * TWO() lines do the same for MP2/MA2.  M80 must hold 0x0080 in every lane
 * and is only read.  MA1/MA2 receive the result; MP1/MP2 are clobbered
 * (they are reused to hold a copy of t).
 */
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
78
    PMULLW     ( MP1, MA1 )			/*                  t1 = p1*a1                   */	;\
79
    PADDW      ( M80, MA1 )			/*                 t1 += 0x80                    */	;\
80
													;\
81
TWO(PMULLW     ( MP2, MA2 ))			/*                  t2 = p2*a2                   */	;\
82
TWO(PADDW      ( M80, MA2 ))			/*                 t2 += 0x80                    */	;\
83
													;\
84
    MOVQ       ( MA1, MP1 )			/*          keep a copy of t1 in MP1             */	;\
85
    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
86
													;\
87
TWO(MOVQ       ( MA2, MP2 ))			/*          keep a copy of t2 in MP2             */	;\
88
TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
89
													;\
90
    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
91
    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
92
													;\
93
TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
94
TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
95
 
96
 
97
/* linear interpolation - geometric series
98
 */
99
/* GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2 )
 *
 * Per 16-bit lane: t = (MP1 - MQ1) * MA1, then
 * MA1 = (t + (t >> 8) + (MQ1 << 8)) >> 8; the TWO() lines do the same for the
 * second register set.  MA1/MA2 receive the result; MP1/MP2 and MQ1/MQ2 are
 * clobbered.
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
100
    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
101
    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
102
    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
103
													;\
104
TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
105
TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
106
TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
107
													;\
108
    MOVQ       ( MA1, MP1 )			/*          keep a copy of t1 in MP1             */	;\
109
    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
110
													;\
111
TWO(MOVQ       ( MA2, MP2 ))			/*          keep a copy of t2 in MP2             */	;\
112
TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
113
													;\
114
    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
115
TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
116
													;\
117
    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
118
TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
119
													;\
120
    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
121
TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
122
 
123
 
124
/* linear interpolation - geometric series with roundoff
125
 *
126
 * this is a generalization of Blinn's formula to signed arithmetic
127
 *
128
 * note that M80 is a register with the 0x0080008000800080 constant
129
 */
130
/* GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80 )
 *
 * Per 16-bit lane: d = MP1 - MQ1, t = d * MA1; when the sign bit of d is set
 * (q > p) 0x100 is subtracted from t; then t += 0x80 and
 * MA1 = (t + (t >> 8) + (MQ1 << 8)) >> 8.  The TWO() lines do the same for
 * the second register set.  M80 must hold 0x0080 in every lane and is only
 * read.  MA1/MA2 receive the result; MP1/MP2 and MQ1/MQ2 are clobbered.
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
131
    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
132
    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
133
    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
134
													;\
135
TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
136
TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
137
TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
138
													;\
139
    PSRLW      ( CONST(15), MP1 )		/*                 q1 > p1 ? 1 : 0               */	;\
140
TWO(PSRLW      ( CONST(15), MP2 ))		/*                 q2 > p2 ? 1 : 0               */	;\
141
													;\
142
    PSLLW      ( CONST(8), MP1 )		/*             q1 > p1 ? 0x100 : 0               */	;\
143
TWO(PSLLW      ( CONST(8), MP2 ))		/*             q2 > p2 ? 0x100 : 0               */	;\
144
													;\
145
    PSUBW      ( MP1, MA1 )			/*                  t1 -=? 0x100                 */	;\
146
TWO(PSUBW      ( MP2, MA2 ))			/*                  t2 -=? 0x100                 */	;\
147
 													;\
148
    PADDW      ( M80, MA1 )			/*                 t1 += 0x80                    */	;\
149
TWO(PADDW      ( M80, MA2 ))			/*                 t2 += 0x80                    */	;\
150
													;\
151
    MOVQ       ( MA1, MP1 )			/*          keep a copy of t1 in MP1             */	;\
152
    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
153
													;\
154
TWO(MOVQ       ( MA2, MP2 ))			/*          keep a copy of t2 in MP2             */	;\
155
TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
156
													;\
157
    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
158
TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
159
													;\
160
    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
161
TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
162
													;\
163
    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
164
TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
165
 
166
 
167
/* linear interpolation - geometric series with correction
168
 *
169
 * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
170
 *
171
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
172
 *
173
 * note that although it is faster than rounding off, it doesn't always give the exact results
174
 */
175
/* GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2 )
 *
 * Per 16-bit lane: t = (MP1 - MQ1) * MA1, then
 * MA1 = (t + (t >> 8) + (t >> 15) + (MQ1 << 8)) >> 8; the TWO() lines do the
 * same for the second register set.  Unlike the other LERP variants the
 * partial sum t + (t >> 8) is accumulated in MP1 while MA1 is shifted on to
 * t >> 15, and the two are added afterwards.  MA1/MA2 receive the result;
 * MP1/MP2 and MQ1/MQ2 are clobbered.
 */
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
176
    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
177
    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
178
    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
179
													;\
180
TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
181
TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
182
TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
183
													;\
184
    MOVQ       ( MA1, MP1 )			/*          keep a copy of t1 in MP1             */	;\
185
    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
186
													;\
187
TWO(MOVQ       ( MA2, MP2 ))			/*          keep a copy of t2 in MP2             */	;\
188
TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
189
													;\
190
    PADDW      ( MA1, MP1 )			/*  MP1 = t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
191
    PSRLW      ( CONST(7), MA1 )		/*        (t1 >> 8) >> 7 = t1 >> 15              */	;\
192
													;\
193
TWO(PADDW      ( MA2, MP2 ))			/*  MP2 = t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
194
TWO(PSRLW      ( CONST(7), MA2 ))		/*        (t2 >> 8) >> 7 = t2 >> 15              */	;\
195
													;\
196
    PADDW      ( MP1, MA1 )			/*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */	;\
197
TWO(PADDW      ( MP2, MA2 ))			/*  t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8  */	;\
198
													;\
199
    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
200
TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
201
													;\
202
    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
203
TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
204
 
205
 
206
/* common blending setup code
207
 *
208
 * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making
209
 *
210
 *   PXOR      ( M00, M00 )
211
 */
212
/* GMB_LOAD( rgba, dest, MPP, MQQ )
 *
 * Loads packed pixels from the memory addressed by `rgba` into MPP (the "p"
 * operand) and from `dest` into MQQ (the "q" operand).  The ONE() lines move
 * 32 bits with MOVD, the TWO() lines move 64 bits with MOVQ.
 * NOTE(review): ONE/TWO are defined outside this file; presumably only one of
 * the two variants is emitted per expansion - confirm in mmx_blendtmp.h.
 */
#define GMB_LOAD(rgba, dest, MPP, MQQ) \
213
ONE(MOVD       ( REGIND(rgba), MPP ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */	;\
214
ONE(MOVD       ( REGIND(dest), MQQ ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */	;\
215
													;\
216
TWO(MOVQ       ( REGIND(rgba), MPP ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */	;\
217
TWO(MOVQ       ( REGIND(dest), MQQ ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
218
 
219
/* GMB_UNPACK( MP1, MQ1, MP2, MQ2, M00 )
 *
 * Widens packed 8-bit channels to 16-bit lanes by interleaving with M00,
 * which must be all zero.  The low four bytes of MP1/MQ1 are unpacked in
 * place; on the TWO() lines the packed values are first copied to MP2/MQ2 and
 * their high four bytes are unpacked there.
 */
#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
220
TWO(MOVQ       ( MP1, MP2 ))			/*      copy packed p before MP1 is widened      */	;\
221
TWO(MOVQ       ( MQ1, MQ2 ))			/*      copy packed q before MQ1 is widened      */	;\
222
													;\
223
    PUNPCKLBW  ( M00, MQ1 )			/*    qa1    |    qb1    |    qg1    |    qr1    */	;\
224
TWO(PUNPCKHBW  ( M00, MQ2 ))                    /*    qa2    |    qb2    |    qg2    |    qr2    */	;\
225
    PUNPCKLBW  ( M00, MP1 )			/*    pa1    |    pb1    |    pg1    |    pr1    */	;\
226
TWO(PUNPCKHBW  ( M00, MP2 ))                    /*    pa2    |    pb2    |    pg2    |    pr2    */
227
 
228
/* GMB_ALPHA( MP1, MA1, MP2, MA2 )
 *
 * Copies MP1 into MA1 and replicates its highest 16-bit lane (the alpha
 * channel of the unpacked pixel) into all four lanes of MA1; the TWO() lines
 * do the same from MP2 into MA2.  MP1/MP2 are only read.
 */
#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
229
    MOVQ       ( MP1, MA1 )										;\
230
TWO(MOVQ       ( MP2, MA2 ))										;\
231
													;\
232
    PUNPCKHWD  ( MA1, MA1 )			/*    pa1    |    pa1    |           |           */	;\
233
TWO(PUNPCKHWD  ( MA2, MA2 ))			/*    pa2    |    pa2    |           |           */	;\
234
    PUNPCKHDQ  ( MA1, MA1 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */	;\
235
TWO(PUNPCKHDQ  ( MA2, MA2 ))                    /*    pa2    |    pa2    |    pa2    |    pa2    */
236
 
237
/* GMB_PACK( MS1, MS2 )
 *
 * Packs the 16-bit lanes of MS1 (low half of the result) and MS2 (high half)
 * back into eight bytes in MS1, with unsigned saturation.
 * The macro deliberately ends without a trailing `;\`: callers append their
 * own separator, and a dangling continuation would swallow whatever line
 * follows the definition.
 */
#define GMB_PACK( MS1, MS2 ) \
238
    PACKUSWB   ( MS2, MS1 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
239
 
240
/* GMB_STORE( rgba, MSS )
 *
 * Writes the packed result in MSS to the memory addressed by `rgba`:
 * 32 bits with MOVD on the ONE() line, 64 bits with MOVQ on the TWO() line.
 */
#define GMB_STORE(rgba, MSS ) \
241
ONE(MOVD       ( MSS, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */	;\
242
TWO(MOVQ       ( MSS, REGIND(rgba) ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
243
 
244
/* Kevin F. Quinn  2 July 2006
245
 * Replace data segment constants with text-segment
246
 * constants (via pushl/movq)
247
    SEG_DATA
248
 
249
ALIGNDATA8
250
const_0080:
251
    D_LONG 0x00800080, 0x00800080
252
 
253
const_80:
254
    D_LONG 0x80808080, 0x80808080
255
*/
256
/* 32-bit halves (_l = low dword, _h = high dword) of the two 64-bit constants
 * that used to live in the data segment.  The INIT macros below push both
 * halves on the stack and load them into MM7 with a single MOVQ.
 */
#define const_0080_l 0x00800080
257
#define const_0080_h 0x00800080
258
#define const_80_l 0x80808080
259
#define const_80_h 0x80808080
260
 
261
/* NOTE(review): SEG_TEXT comes from assyntax.h; presumably it switches to the
 * code section - confirm there. */
    SEG_TEXT
262
 
263
 
264
/* Blend transparency function
265
 */
266
 
267
/* Parameters for one expansion of mmx_blendtmp.h: TAG/LLTAG give the
 * `_transparency` suffix for names and local labels, INIT holds the one-time
 * register setup, and MAIN( rgba, dest ) holds the per-iteration blend.
 * NOTE(review): the macros are redefined for every blend mode without an
 * #undef in this file, so mmx_blendtmp.h presumably undefines them - confirm.
 */
#define TAG(x) CONCAT(x,_transparency)
268
#define LLTAG(x) LLBL2(x,_transparency)
269
 
270
/* MM0 = 0, used by GMB_UNPACK as the zero register */
#define INIT \
271
    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */
272
 
273
/* rgba = dest + (rgba - dest) * alpha(rgba), computed with GMB_LERP_GSC;
 * the result ends up in MM3 (and MM6 for the second pixel) and is stored
 * back over rgba.
 */
#define MAIN( rgba, dest ) \
274
    GMB_LOAD( rgba, dest, MM1, MM2 )			/* MM1 = p (rgba), MM2 = q (dest)        */	;\
275
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )		/* bytes -> words; 2nd pixel in MM4/MM5  */	;\
276
    GMB_ALPHA( MM1, MM3, MM4, MM6 )			/* MM3/MM6 = alpha of p in every lane    */	;\
277
    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )	/* MM3/MM6 = blended words               */	;\
278
    GMB_PACK( MM3, MM6 )				/* words -> bytes in MM3                 */	;\
279
    GMB_STORE( rgba, MM3 )
280
 
281
#include "mmx_blendtmp.h"
282
 
283
 
284
/* Blend add function
285
 *
286
 * FIXME: Add some loop unrolling here...
287
 */
288
 
289
/* Parameters for the `_add` expansion of mmx_blendtmp.h.  No register setup
 * is needed, so INIT is empty.  MAIN adds the packed bytes of rgba and dest
 * with unsigned saturation (PADDUSB) and stores the sum back over rgba.
 */
#define TAG(x) CONCAT(x,_add)
290
#define LLTAG(x) LLBL2(x,_add)
291
 
292
#define INIT
293
 
294
#define MAIN( rgba, dest ) \
295
ONE(MOVD       ( REGIND(rgba), MM1 ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */	;\
296
ONE(MOVD       ( REGIND(dest), MM2 ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */	;\
297
ONE(PADDUSB    ( MM2, MM1 ))			/*          saturating byte-wise p + q           */	;\
298
ONE(MOVD       ( MM1, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */	;\
299
													;\
300
TWO(MOVQ       ( REGIND(rgba), MM1 ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */	;\
301
TWO(PADDUSB    ( REGIND(dest), MM1 ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */	;\
302
TWO(MOVQ       ( MM1, REGIND(rgba) ))
303
 
304
#include "mmx_blendtmp.h"
305
 
306
 
307
/* Blend min function
308
 */
309
 
310
/* Parameters for the `_min` expansion of mmx_blendtmp.h. */
#define TAG(x) CONCAT(x,_min)
310
#define LLTAG(x) LLBL2(x,_min)
311
 
312
/* Kevin F. Quinn 2nd July 2006
313
 * Replace data segment constants with text-segment instructions
314
#define INIT \
315
    MOVQ       ( CONTENT(const_80), MM7 )
316
 */
317
/* MM7 = 0x80 in every byte, built on the stack: the high dword is pushed
 * first and the low dword last, so the low dword sits at (ESP) and becomes
 * the low half of MM7.  ESP is restored before INIT ends.
 */
#define INIT \
318
    PUSH_L     ( CONST(const_80_h) ) 		/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/	;\
319
    PUSH_L     ( CONST(const_80_l) ) 									;\
320
    MOVQ       ( REGIND(ESP), MM7 ) 		/*           load both dwords at once            */	;\
321
    ADD_L      ( CONST(8), ESP)
322
 
323
/* Byte-wise unsigned minimum of rgba (p, MM1) and dest (q, MM2).  PCMPGTB is
 * a signed compare, so both operands are first XORed with 0x80 (MM7) to map
 * unsigned order onto signed order; the resulting mask then selects p or q
 * per byte.  The result in MM4 is stored back over rgba.
 */
#define MAIN( rgba, dest ) \
324
    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
325
    MOVQ       ( MM1, MM3 )										;\
326
    MOVQ       ( MM2, MM4 )										;\
327
    PXOR       ( MM7, MM3 )			/*              unsigned -> signed               */	;\
328
    PXOR       ( MM7, MM4 )			/*              unsigned -> signed               */	;\
329
    PCMPGTB    ( MM3, MM4 )			/*                 q > p ? 0xff : 0x00           */	;\
330
    PAND       ( MM4, MM1 )			/*                 q > p ? p : 0                 */	;\
331
    PANDN      ( MM2, MM4 )			/*                 q > p ? 0 : q                 */	;\
332
    POR        ( MM1, MM4 )			/*                 q > p ? p : q                 */	;\
333
    GMB_STORE( rgba, MM4 )
334
 
335
#include "mmx_blendtmp.h"
337
 
338
 
339
/* Blend max function
340
 */
341
 
342
/* Parameters for the `_max` expansion of mmx_blendtmp.h. */
#define TAG(x) CONCAT(x,_max)
343
#define LLTAG(x) LLBL2(x,_max)
344
 
345
/* Kevin F. Quinn 2nd July 2006
346
 * Replace data segment constants with text-segment instructions
347
#define INIT \
348
    MOVQ       ( CONTENT(const_80), MM7 )
349
 */
350
/* MM7 = 0x80 in every byte, built on the stack.  The high dword must be
 * pushed first and the low dword last, so that the low dword sits at (ESP)
 * and becomes the low half of MM7 (same order as the `_min` INIT).  ESP is
 * restored before INIT ends.
 */
#define INIT \
351
    PUSH_L     ( CONST(const_80_h) ) 		/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/	;\
352
    PUSH_L     ( CONST(const_80_l) ) 									;\
353
    MOVQ       ( REGIND(ESP), MM7 ) 		/*           load both dwords at once            */	;\
354
    ADD_L      ( CONST(8), ESP)
355
 
356
/* Byte-wise unsigned maximum of rgba (p, MM1) and dest (q, MM2).  PCMPGTB is
 * a signed compare, so both operands are first XORed with 0x80 (MM7) to map
 * unsigned order onto signed order; the resulting mask then selects q or p
 * per byte.  The result in MM4 is stored back over rgba.
 */
#define MAIN( rgba, dest ) \
357
    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
358
    MOVQ       ( MM1, MM3 )										;\
359
    MOVQ       ( MM2, MM4 )										;\
360
    PXOR       ( MM7, MM3 )			/*              unsigned -> signed               */	;\
361
    PXOR       ( MM7, MM4 )			/*              unsigned -> signed               */	;\
362
    PCMPGTB    ( MM3, MM4 )			/*                 q > p ? 0xff : 0x00           */	;\
363
    PAND       ( MM4, MM2 )			/*                 q > p ? q : 0                 */	;\
364
    PANDN      ( MM1, MM4 )			/*                 q > p ? 0 : p                 */	;\
365
    POR        ( MM2, MM4 )			/*                 q > p ? q : p                 */	;\
366
    GMB_STORE( rgba, MM4 )
367
 
368
#include "mmx_blendtmp.h"
369
 
370
 
371
/* Blend modulate function
372
 */
373
 
374
/* Parameters for the `_modulate` expansion of mmx_blendtmp.h. */
#define TAG(x) CONCAT(x,_modulate)
375
#define LLTAG(x) LLBL2(x,_modulate)
376
 
377
/* Kevin F. Quinn 2nd July 2006
378
 * Replace data segment constants with text-segment instructions
379
#define INIT \
380
    MOVQ       ( CONTENT(const_0080), MM7 )
381
 */
382
/* MM0 = 0 (zero register for GMB_UNPACK) and MM7 = 0x0080 in every 16-bit
 * lane (rounding term for GMB_MULT_GSR).  MM7 is built on the stack: the
 * high dword must be pushed first and the low dword last, so that the low
 * dword sits at (ESP) and becomes the low half of MM7.  ESP is restored
 * before INIT ends.
 */
#define INIT \
383
    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */	;\
384
    PUSH_L     ( CONST(const_0080_h) ) 	/*   0x0080  |   0x0080  |   0x0080  |   0x0080  */	;\
385
    PUSH_L     ( CONST(const_0080_l) ) 								;\
386
    MOVQ       ( REGIND(ESP), MM7 ) 		/*           load both dwords at once            */	;\
387
    ADD_L      ( CONST(8), ESP)
388
 
389
/* rgba = rgba * dest / 255 per channel, using the rounded geometric-series
 * multiply; the product ends up in MM2 (and MM5 for the second pixel) and is
 * stored back over rgba.
 */
#define MAIN( rgba, dest ) \
390
    GMB_LOAD( rgba, dest, MM1, MM2 )			/* MM1 = p (rgba), MM2 = q (dest)        */	;\
391
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )		/* bytes -> words; 2nd pixel in MM4/MM5  */	;\
392
    GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )		/* MM2/MM5 = p*q/255, rounded            */	;\
393
    GMB_PACK( MM2, MM5 )				/* words -> bytes in MM2                 */	;\
394
    GMB_STORE( rgba, MM2 )
395
 
396
#include "mmx_blendtmp.h"
397
 
398
#endif
399
 
400
/* On ELF/Linux builds emit an empty .note.GNU-stack section.  GNU toolchains
 * conventionally read this marker as "this object does not need an
 * executable stack"; hand-written assembly has to provide it explicitly.
 */
#if defined (__ELF__) && defined (__linux__)
401
	.section .note.GNU-stack,"",%progbits
402
#endif