Subversion Repositories Kolibri OS

Rev

Rev 9172 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
8210 maxcodehac 1
;
2
; pII-optimised MMX format converters for HERMES
3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5
; This source code is licensed under the GNU LGPL
6
;
7
; Please refer to the file COPYING.LIB contained in the distribution for
8
; licensing conditions
9
;
10
; COPYRIGHT NOTICE
11
;
12
; This file partly contains code that is (c) Intel Corporation, specifically
13
; the mode detection routine, and the converter to 15 bit (8 pixel
14
; conversion routine from the mmx programming tutorial pages).
15
;
16
;
17
; These routines aren't exactly pII optimised - it's just that as they
18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
19
; optimise them for p5 MMXs..
20
 
21
BITS 32
22
 
9172 turbocat 23
%include "common.inc"
8210 maxcodehac 24
 
9172 turbocat 25
; Entry-point declarations.  SDL_FUNC is supplied by common.inc (not shown
; here) -- presumably it exports the symbol; confirm against common.inc.
; Every converter is declared under two names: with a leading underscore ...
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555

; ... and without one.  Both labels of a pair mark the same address in .text.
SDL_FUNC ConvertMMXpII32_24RGB888
SDL_FUNC ConvertMMXpII32_16RGB565
SDL_FUNC ConvertMMXpII32_16BGR565
SDL_FUNC ConvertMMXpII32_16RGB555
SDL_FUNC ConvertMMXpII32_16BGR555
36
 
37
 
9172 turbocat 38
;; Macros for conversion routines
8210 maxcodehac 39
 
9172 turbocat 40
; _push_immq_mask imm32
;   Pushes the 32-bit immediate twice, so that [esp] holds a 64-bit value
;   whose low and high dwords are both imm32.  The 8 bytes stay on the stack
;   until the caller releases them (see CLEANUP_IMMQ_LOADS).
%macro _push_immq_mask 1
%rep 2
        push    dword %1
%endrep
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32 replicated into both dwords.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, qword [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32 replicated into both dwords.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, qword [esp]
%endmacro
8210 maxcodehac 54
 
9172 turbocat 55
; CLEANUP_IMMQ_LOADS(num)
;   Releases the stack space left behind by `num` preceding load_immq /
;   pand_immq expansions (8 bytes each).  Expands to an ADD, so EFLAGS are
;   modified.  The argument is parenthesised so that an expression argument
;   is scaled as a whole (8 * (a+b), not 8*a + b).
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
8210 maxcodehac 57
 
9172 turbocat 58
; 32-bit immediates used by the converters below.  load_immq / pand_immq
; replicate each one into both dwords of an MMX register or memory operand.

; Keeps bits 0-23 of a source pixel and clears the top byte.
%define mmx32_rgb888_mask 0x00ffffff

; 565 field masks: top 5 bits of byte 0, top 6 bits of byte 1,
; top 5 bits of byte 2.
%define mmx32_rgb565_b    0x000000f8
%define mmx32_rgb565_g    0x0000fc00
%define mmx32_rgb565_r    0x00f80000

; 555 field masks: top 5 bits of bytes 0 and 2 together, top 5 bits of byte 1.
%define mmx32_rgb555_rb   0x00f800f8
%define mmx32_rgb555_g    0x0000f800

; pmaddwd multiplier pairs (low word, high word):
;   rgb555: 0x0008, 0x2000      bgr555: 0x2000, 0x0008
%define mmx32_rgb555_mul  0x20000008
%define mmx32_bgr555_mul  0x00082000
67
 
8210 maxcodehac 68
SECTION .text
69
 
9202 turbocat 70
;-----------------------------------------------------------------------
; ConvertMMXpII32_24RGB888 -- pack 4-byte pixels into 3-byte pixels by
; dropping the top byte of each one.
; In:    esi = source (4 bytes per pixel)
;        edi = destination (3 bytes per pixel)
;        ecx = number of pixels
; Clobb: al, bl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
;-----------------------------------------------------------------------
ConvertMMXpII32_24RGB888:
_ConvertMMXpII32_24RGB888:

        ; mm6 = 0x00ffffff in each dword, mm7 = 0
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7

        mov     edx, ecx                ; edx = full pixel count (for the tail)
        and     ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
        jz      .tail                   ; fewer than 4 pixels: scalar path only

.quad_loop:
        ; Four pixels in (16 bytes), 12 bytes out.
        movq    mm0, [esi]              ; pixels 0 (low dword) and 1 (high dword)
        pand    mm0, mm6                ; clear the top byte of each
        movq    mm1, [esi+8]            ; pixels 2 and 3
        pand    mm1, mm6

        movq    mm2, mm0
        movq    mm3, mm1
        movq    mm4, mm1

        punpckhdq mm2, mm7              ; mm2 = pixel 1, zero-extended
        punpckldq mm0, mm7              ; mm0 = pixel 0, zero-extended
        psllq   mm2, 24                 ; pixel 1 -> output bytes 3-5
        psllq   mm3, 48                 ; low two bytes of pixel 2 -> output bytes 6-7
        por     mm0, mm2
        por     mm0, mm3                ; mm0 = output bytes 0-7

        punpckhdq mm4, mm7              ; mm4 = pixel 3, zero-extended
        punpckldq mm1, mm7              ; mm1 = pixel 2, zero-extended
        psrlq   mm1, 16                 ; third byte of pixel 2 -> output byte 8
        psllq   mm4, 8                  ; pixel 3 -> output bytes 9-11
        por     mm1, mm4                ; low dword of mm1 = output bytes 8-11

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0-3 pixels left over
        jz      .done

.tail_loop:
        ; Copy the three low bytes of one pixel.
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        retn
8210 maxcodehac 131
 
132
 
9202 turbocat 133
;-----------------------------------------------------------------------
; ConvertMMXpII32_16RGB565 -- convert 4-byte pixels to 16-bit 5:6:5 pixels.
; Output bits 0-4 come from source byte 0, bits 5-10 from byte 1 and
; bits 11-15 from byte 2.
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
ConvertMMXpII32_16RGB565:
_ConvertMMXpII32_16RGB565:

        ; Field masks, replicated into both dwords.
        load_immq mm5, mmx32_rgb565_b   ; 0x000000f8
        load_immq mm6, mmx32_rgb565_g   ; 0x0000fc00
        load_immq mm7, mmx32_rgb565_r   ; 0x00f80000
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full pixel count (for the tail)
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; byte-1 field, bits 10-15
        movq    mm3, mm1
        pand    mm1, mm5                ; byte-0 field, bits 3-7
        pand    mm3, mm7                ; byte-2 field, bits 19-23
        pslld   mm1, 2                  ; byte-0 field -> bits 5-9
        por     mm0, mm1
        psrld   mm0, 5                  ; bits 0-4 and 5-10 now in final position

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Each byte-2 field sits in the upper word of its dword as 0x00f8;
        ; packing words to bytes places it in the high byte of a 16-bit lane.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; four low-11-bit results, one per word
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0-3 pixels left over
        jz      .done

.tail_loop:
        mov     al, [esi]               ; byte 0
        mov     bh, [esi+1]             ; byte 1
        mov     ah, [esi+2]             ; byte 2
        shr     al, 3
        and     eax, 0F81Fh             ; keep bits 11-15 and 0-4
        shr     ebx, 5                  ; top 6 bits of bh -> bits 5-10
        and     ebx, 07E0h              ; discards whatever else was in ebx
        or      eax, ebx                ; fields are disjoint
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        retn
8210 maxcodehac 200
 
9202 turbocat 201
;-----------------------------------------------------------------------
; ConvertMMXpII32_16BGR565 -- convert 4-byte pixels to 16-bit 5:6:5 pixels
; with the outer fields swapped relative to the RGB565 converter:
; output bits 0-4 come from source byte 2, bits 5-10 from byte 1 and
; bits 11-15 from byte 0.
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
ConvertMMXpII32_16BGR565:
_ConvertMMXpII32_16BGR565:

        ; Field masks, replicated into both dwords.
        load_immq mm5, mmx32_rgb565_r   ; 0x00f80000
        load_immq mm6, mmx32_rgb565_g   ; 0x0000fc00
        load_immq mm7, mmx32_rgb565_b   ; 0x000000f8
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full pixel count (for the tail)
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; byte-1 field, bits 10-15
        movq    mm3, mm1
        pand    mm1, mm5                ; byte-2 field, bits 19-23
        pand    mm3, mm7                ; byte-0 field, bits 3-7

        psllq   mm3, 16                 ; byte-0 field -> upper word of its dword
        psrld   mm1, 14                 ; byte-2 field -> bits 5-9
        por     mm0, mm1
        psrld   mm0, 5                  ; bits 0-4 and 5-10 now in final position

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Packing words to bytes places each 0x00f8 upper word in the high
        ; byte of a 16-bit lane, i.e. output bits 11-15.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; four low-11-bit results, one per word
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        and     edx, BYTE 3             ; edx = 0-3 pixels left over
        jz      .done

.tail_loop:
        mov     al, [esi+2]             ; byte 2
        mov     bh, [esi+1]             ; byte 1
        mov     ah, [esi]               ; byte 0
        shr     al, 3
        and     eax, 0F81Fh             ; keep bits 11-15 and 0-4
        shr     ebx, 5                  ; top 6 bits of bh -> bits 5-10
        and     ebx, 07E0h              ; discards whatever else was in ebx
        or      eax, ebx                ; fields are disjoint
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .tail_loop

.done:
        retn
8210 maxcodehac 270
 
9202 turbocat 271
;-----------------------------------------------------------------------
; ConvertMMXpII32_16BGR555 / ConvertMMXpII32_16RGB555
;
; Convert 4-byte pixels to 16-bit 5:5:5 pixels.  The two entry points share
; one body and differ only in the pmaddwd multiplier held in mm7:
;   RGB555: output bits 0-4 <- source byte 0, bits 10-14 <- source byte 2
;   BGR555: output bits 0-4 <- source byte 2, bits 10-14 <- source byte 0
; In both, output bits 5-9 come from source byte 1 and bit 15 is zero.
;
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Clobb: eax, ecx, edx, esi, edi, mm0-mm3, mm5-mm7, flags
;
; How a pixel is converted (same for the main loop and the tail):
;   t  = pixel & 0x00f800f8                  ; two 5-bit fields, one per word
;   t  = pmaddwd(t, mm7)                     ; one field << 3, the other << 13
;   t |= pixel & 0x0000f800                  ; middle field, bits 11-15
;   t >>= 6                                  ; fields land on bits 0-4, 5-9, 10-14
;
; The leftover-pixel loop goes through mm7 as well, so pixels beyond the
; last multiple of 8 get the field order of the entry point that was used.
; The main loop only reads the pixels it converts; nothing is fetched ahead.
;-----------------------------------------------------------------------
ConvertMMXpII32_16BGR555:
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

ConvertMMXpII32_16RGB555:
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul

_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g   ; mm6 = 0x0000f800 in each dword
        load_immq mm5, mmx32_rgb555_rb  ; mm5 = 0x00f800f8 in each dword
        CLEANUP_IMMQ_LOADS(3)           ; mm7 + mm6 + mm5 loads

        mov     edx, ecx                ; edx = full pixel count (for the tail)
        and     ecx, DWORD 0fffffff8h   ; ecx = count rounded down to a multiple of 8
        jz      near .tail              ; fewer than 8 pixels: tail only

.oct_loop:
        ; --- pixels 0-3 -> [edi] ---
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm2, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; outer 5-bit fields
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; shift and merge them inside each dword
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; middle 5-bit field
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6                  ; all three fields in final position
        psrld   mm3, 6
        packssdw mm1, mm3               ; values are <= 0x7fff, so no saturation
        movq    [edi], mm1

        ; --- pixels 4-7 -> [edi+8] ---
        movq    mm0, [esi+16]           ; pixels 4,5
        movq    mm2, [esi+24]           ; pixels 6,7
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .oct_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 7             ; 0-7 pixels left over
        jz      .done

.tail_loop:
        ; One pixel at a time, using the same masks and multiplier as above.
        movd    mm0, [esi]
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        retn
8210 maxcodehac 413
 
9172 turbocat 414
%ifidn __OUTPUT_FORMAT__,elf32
415
section .note.GNU-stack noalloc noexec nowrite progbits
416
%endif