Subversion Repositories Kolibri OS

Rev

Rev 8210 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
8210 maxcodehac 1
;
2
; pII-optimised MMX format converters for HERMES
3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5
; This source code is licensed under the GNU LGPL
6
;
7
; Please refer to the file COPYING.LIB contained in the distribution for
8
; licensing conditions
9
;
10
; COPYRIGHT NOTICE
11
;
12
; This file partly contains code that is (c) Intel Corporation, specifically
13
; the mode detection routine, and the converter to 15 bit (8 pixel
14
; conversion routine from the mmx programming tutorial pages).
15
;
16
;
17
; These routines aren't exactly pII optimised - it's just that as they
18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
19
; optimise them for p5 MMXs..
20
 
21
BITS 32
22
 
9172 turbocat 23
%include "common.inc"
8210 maxcodehac 24
 
9172 turbocat 25
SDL_FUNC _ConvertMMXpII32_24RGB888
26
SDL_FUNC _ConvertMMXpII32_16RGB565
27
SDL_FUNC _ConvertMMXpII32_16BGR565
28
SDL_FUNC _ConvertMMXpII32_16RGB555
29
SDL_FUNC _ConvertMMXpII32_16BGR555
8210 maxcodehac 30
 
9172 turbocat 31
;; Macros for conversion routines
;;
;; MMX instructions take no immediate operands, so the 64-bit constants used
;; below are materialised on the stack: a 32-bit value is pushed twice, which
;; leaves a quadword at [esp] whose low and high dwords are both that value.
;; Every expansion of load_immq / pand_immq therefore lowers esp by 8 bytes;
;; the code using them must give those bytes back with CLEANUP_IMMQ_LOADS(n)
;; (n = number of expansions still outstanding) before it returns.

; _push_immq_mask imm32
;   Push imm32 twice, so that qword [esp] = imm32:imm32.  esp -= 8.
%macro _push_immq_mask 1
        push dword %1
        push dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Release the stack space of `num` earlier load_immq / pand_immq expansions.
;   The argument is parenthesised so that an expression (e.g. 1+1) scales
;   correctly.  The adjustment is encoded as a signed byte, so 8 * num must
;   stay below 128.  Note that ADD rewrites EFLAGS.
%define CLEANUP_IMMQ_LOADS(num) \
        add esp, byte 8 * (num)
8210 maxcodehac 50
 
9172 turbocat 51
; Constants for the converters.  Each is a 32-bit value that load_immq /
; pand_immq duplicate into both halves of an MMX register, so it applies to
; two source pixels at once.  Source pixels are dwords laid out 0xAARRGGBB.

; Keeps the three colour bytes, clears the top byte.
%define mmx32_rgb888_mask       0x00ffffff

; 5-6-5 targets: top 5 bits of blue, top 6 bits of green, top 5 bits of red.
%define mmx32_rgb565_b          0x000000f8
%define mmx32_rgb565_g          0x0000fc00
%define mmx32_rgb565_r          0x00f80000

; 5-5-5 targets: top 5 bits of red and blue together, and top 5 bits of green.
%define mmx32_rgb555_rb         0x00f800f8
%define mmx32_rgb555_g          0x0000f800

; Word multipliers handed to pmaddwd by the 5-5-5 converters (low word is
; applied to the blue word of a pixel, high word to the red word).
%define mmx32_rgb555_mul        0x20000008
%define mmx32_bgr555_mul        0x00082000
60
 
8210 maxcodehac 61
SECTION .text
62
 
63
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Pack 32-bit pixels into 24-bit pixels: bytes 0..2 of every source
;   dword are copied, byte 3 is dropped.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the data that was read / written
; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
; Note:  EMMS is not executed here -- presumably the caller does that
;        (TODO confirm against the calling code).
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        load_immq mm6, mmx32_rgb888_mask    ; mm6 = 00ffffff:00ffffff
        CLEANUP_IMMQ_LOADS(1)
        pxor mm7, mm7                       ; mm7 = 0, used to zero-extend

        mov edx, ecx                        ; edx keeps the full count
        and ecx, 0fffffffch                 ; ecx = count rounded down to x4
        jz .tail                            ; fewer than 4 pixels

        ; Four pixels (16 bytes) in, 12 bytes out per iteration.
        ; Register pictures list bytes from most to least significant.
.quad:
        movq mm0, [esi]                 ; X R1 G1 B1 X r0 g0 b0
        movq mm1, [esi+8]               ; X R3 G3 B3 X r2 g2 b2
        pand mm0, mm6                   ; 0 R1 G1 B1 0 r0 g0 b0
        pand mm1, mm6                   ; 0 R3 G3 B3 0 r2 g2 b2

        movq mm2, mm0
        movq mm3, mm1
        movq mm4, mm1

        punpckhdq mm2, mm7              ; 0 0 0 0 0 R1 G1 B1
        punpckldq mm0, mm7              ; 0 0 0 0 0 r0 g0 b0
        psllq mm3, 48                   ; g2 b2 0 0 0 0 0 0
        psllq mm2, 24                   ; 0 0 R1 G1 B1 0 0 0
        por mm0, mm2                    ; 0 0 R1 G1 B1 r0 g0 b0
        por mm0, mm3                    ; g2 b2 R1 G1 B1 r0 g0 b0

        punpckhdq mm4, mm7              ; 0 0 0 0 0 R3 G3 B3
        punpckldq mm1, mm7              ; 0 0 0 0 0 r2 g2 b2
        psllq mm4, 8                    ; 0 0 0 0 R3 G3 B3 0
        psrlq mm1, 16                   ; 0 0 0 0 0 0 0 r2
        por mm1, mm4                    ; 0 0 0 0 R3 G3 B3 r2

        movq [edi], mm0                 ; output bytes 0..7
        movd [edi+8], mm1               ; output bytes 8..11
        add esi, BYTE 16
        add edi, BYTE 12
        sub ecx, BYTE 4
        jnz .quad

.tail:
        mov ecx, edx
        and ecx, BYTE 3                 ; 0..3 pixels left over
        jz .done

        ; One pixel per iteration: copy three bytes, skip the fourth.
.single:
        mov al, [esi]
        mov bl, [esi+1]
        mov dl, [esi+2]
        add esi, BYTE 4
        mov [edi], al
        mov [edi+1], bl
        mov [edi+2], dl
        add edi, BYTE 3
        dec ecx
        jnz .single

.done:
        retn
8210 maxcodehac 123
 
124
 
125
 
126
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Convert 32-bit pixels (dword 0xAARRGGBB) to 16-bit 5-6-5 pixels with
;   red in bits 11-15, green in bits 5-10 and blue in bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the data that was read / written
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
; Note:  EMMS is not executed here -- presumably the caller does that
;        (TODO confirm against the calling code).
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; channel masks, one copy per pixel in each register
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov edx, ecx                    ; edx keeps the full count
        shr ecx, 2                      ; ecx = number of 4-pixel groups
        jz .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq mm0, [esi]
        movq mm1, mm0
        movq mm3, mm0
        pand mm0, mm6                   ; green, bits 10-15
        pand mm1, mm5                   ; blue,  bits 3-7
        pand mm3, mm7                   ; red,   bits 19-23 (kept for later)
        pslld mm1, 2                    ; blue to bits 5-9
        por mm0, mm1                    ; gggggg bbbbb 00000
        psrld mm0, 5                    ; green 5-10, blue 0-4

        ; --- pixels 2 and 3 ---
        movq mm4, [esi+8]
        movq mm2, mm4
        movq mm1, mm4
        pand mm4, mm6                   ; green
        pand mm2, mm5                   ; blue
        pand mm1, mm7                   ; red
        pslld mm2, 2
        por mm4, mm2
        psrld mm4, 5

        ; Each red dword is the word pair 0000:00f8-masked value; packing the
        ; words to bytes puts red in the high byte of every 16-bit result.
        packuswb mm3, mm1               ; four words, red in bits 11-15
        packssdw mm0, mm4               ; four words, green + blue
        por mm0, mm3
        movq [edi], mm0

        add esi, 16
        add edi, 8
        dec ecx
        jnz .quad

.tail:
        mov ecx, edx
        and ecx, BYTE 3                 ; 0..3 pixels left over
        jz .done

        ; One pixel per iteration using integer registers.
.single:
        mov al, [esi]                   ; blue
        mov ah, [esi+2]                 ; red
        mov bh, [esi+1]                 ; green in bits 8-15 of ebx
        shr al, 3                       ; blue down to 5 bits
        shr ebx, 5                      ; top 6 green bits now at bits 5-10
        and eax, 0F81Fh                 ; keep red 11-15 and blue 0-4
        and ebx, 07E0h                  ; keep green 5-10, drop stale bits
        or eax, ebx                     ; fields are disjoint
        mov [edi], al
        mov [edi+1], ah
        add esi, BYTE 4
        add edi, BYTE 2
        dec ecx
        jnz .single

.done:
        retn
8210 maxcodehac 192
 
193
 
194
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Convert 32-bit pixels (dword 0xAARRGGBB) to 16-bit 5-6-5 pixels with
;   blue in bits 11-15, green in bits 5-10 and red in bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the data that was read / written
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
; Note:  EMMS is not executed here -- presumably the caller does that
;        (TODO confirm against the calling code).
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; channel masks, one copy per pixel in each register
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov edx, ecx                    ; edx keeps the full count
        shr ecx, 2                      ; ecx = number of 4-pixel groups
        jz .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq mm0, [esi]
        movq mm1, mm0
        movq mm3, mm0
        pand mm0, mm6                   ; green, bits 10-15
        pand mm1, mm5                   ; red,   bits 19-23
        pand mm3, mm7                   ; blue,  bits 3-7
        psrld mm1, 14                   ; red to bits 5-9
        psllq mm3, 16                   ; blue to bits 19-23 of its dword
        por mm0, mm1                    ; gggggg rrrrr 00000
        psrld mm0, 5                    ; green 5-10, red 0-4

        ; --- pixels 2 and 3 ---
        movq mm4, [esi+8]
        movq mm2, mm4
        movq mm1, mm4
        pand mm4, mm6                   ; green
        pand mm2, mm5                   ; red
        pand mm1, mm7                   ; blue
        psrld mm2, 14
        psllq mm1, 16
        por mm4, mm2
        psrld mm4, 5

        ; Packing the shifted blue words to bytes leaves blue in the high
        ; byte of every 16-bit result.
        packuswb mm3, mm1               ; four words, blue in bits 11-15
        packssdw mm0, mm4               ; four words, green + red
        por mm0, mm3
        movq [edi], mm0

        add esi, BYTE 16
        add edi, BYTE 8
        dec ecx
        jnz .quad

.tail:
        and edx, BYTE 3                 ; 0..3 pixels left over
        jz .done

        ; One pixel per iteration using integer registers; edx counts down.
.single:
        mov al, [esi+2]                 ; red
        mov ah, [esi]                   ; blue
        mov bh, [esi+1]                 ; green in bits 8-15 of ebx
        shr al, 3                       ; red down to 5 bits
        shr ebx, 5                      ; top 6 green bits now at bits 5-10
        and eax, 0F81Fh                 ; keep blue 11-15 and red 0-4
        and ebx, 07E0h                  ; keep green 5-10, drop stale bits
        or eax, ebx                     ; fields are disjoint
        mov [edi], al
        mov [edi+1], ah
        add esi, BYTE 4
        add edi, BYTE 2
        dec edx
        jnz .single

.done:
        retn
8210 maxcodehac 262
 
263
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;   Convert 32-bit pixels (dword 0xAARRGGBB) to 16-bit 5-5-5 pixels.
;     RGB555: red in bits 10-14, green in bits 5-9, blue in bits 0-4
;     BGR555: blue in bits 10-14, green in bits 5-9, red in bits 0-4
;   Bit 15 of every output pixel is zero.
;
;   Both entry points share one body; they differ only in the pmaddwd
;   multiplier loaded into mm7, which decides where red and blue land.
;   Every pixel -- including the 1..3 left over after the 4-pixel loop --
;   goes through that multiplier, so the leftover pixels get the same
;   channel order as the rest.  Nothing beyond the ecx source pixels is
;   read.
;
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the data that was read / written
; Clobb: eax, ecx, edx, mm0-mm3, mm5-mm7, flags
; Note:  EMMS is not executed here -- presumably the caller does that
;        (TODO confirm against the calling code).
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul     ; blue * 2000h, red * 8
        jmp _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul     ; blue * 8, red * 2000h

_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g       ; green mask
        load_immq mm5, mmx32_rgb555_rb      ; red + blue mask
        CLEANUP_IMMQ_LOADS(3)               ; mm7, mm6 and mm5 loads

        mov edx, ecx                    ; edx keeps the full count
        shr ecx, 2                      ; ecx = number of 4-pixel groups
        jz .tail

        ; Four pixels (16 bytes) in, four 16-bit pixels (8 bytes) out.
.quad:
        movq mm0, [esi]                 ; pixels 0,1
        movq mm2, [esi+8]               ; pixels 2,3
        movq mm1, mm0
        movq mm3, mm2

        pand mm1, mm5                   ; 5-bit red and blue as two words
        pand mm3, mm5
        pmaddwd mm1, mm7                ; one channel to bits 6-10, the
        pmaddwd mm3, mm7                ;   other to bits 16-20
        pand mm0, mm6                   ; 5-bit green, bits 11-15
        pand mm2, mm6

        por mm0, mm1                    ; all three fields, bits 6-20
        por mm2, mm3
        psrld mm0, 6                    ; down to bits 0-14
        psrld mm2, 6
        packssdw mm0, mm2               ; values <= 7fffh, never saturates
        movq [edi], mm0

        add esi, BYTE 16
        add edi, BYTE 8
        dec ecx
        jnz .quad

.tail:
        and edx, BYTE 3                 ; 0..3 pixels left over
        jz .done

        ; One pixel per iteration, same arithmetic as above on a single
        ; dword (the upper half of the MMX registers is ignored).
.single:
        movd mm0, [esi]
        movq mm1, mm0
        pand mm1, mm5                   ; red + blue
        pmaddwd mm1, mm7                ; position them per entry point
        pand mm0, mm6                   ; green
        por mm0, mm1
        psrld mm0, 6
        movd eax, mm0
        mov [edi], ax

        add esi, BYTE 4
        add edi, BYTE 2
        dec edx
        jnz .single

.done:
        retn
8210 maxcodehac 402
 
9172 turbocat 403
; For ELF32 objects only: emit an empty, non-executable .note.GNU-stack
; section (GNU toolchain convention for "no executable stack required").
%ifidn __OUTPUT_FORMAT__, elf32
        section .note.GNU-stack noalloc noexec nowrite progbits
%endif