Rev 8210 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
; optimise them for p5 MMXs..
20 | |||
; Assemble as 32-bit code.
BITS 32

%include "common.inc"

; Converter entry points defined in this file, announced through SDL_FUNC.
; NOTE(review): SDL_FUNC is not defined in this file; presumably it comes from
; common.inc and declares the symbol global - confirm there.
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555
||
8210 | maxcodehac | 30 | |
;; Macros for conversion routines
;;
;; MMX instructions take no immediate operands, so a 64-bit constant is built
;; by pushing the same 32-bit value twice and using the qword at [esp] as the
;; memory operand.  Every load_immq / pand_immq therefore leaves 8 bytes on
;; the stack, which the user must release with CLEANUP_IMMQ_LOADS(n).

; _push_immq_mask imm32
;   Push imm32 twice; afterwards [esp] holds a qword whose two dwords are
;   both imm32.
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Release the stack space left behind by `num` load_immq / pand_immq uses.
;   The argument is parenthesised so that a compound expression such as
;   CLEANUP_IMMQ_LOADS(1+1) frees 16 bytes, not 8*1+1.  The `byte` immediate
;   form is kept, so 8*num has to fit in a signed byte.
;   Expands to an ADD, so EFLAGS are overwritten.
%define CLEANUP_IMMQ_LOADS(num) \
        add     esp, byte 8 * (num)
||
8210 | maxcodehac | 50 | |
; 32-bit constants.  load_immq / pand_immq replicate each one into both
; dword lanes of an MMX register, i.e. one copy per source pixel.

; keeps the low three bytes of a 32-bit pixel
%define mmx32_rgb888_mask 0x00ffffff

; 5-6-5 component masks: top 5 bits of byte 0, top 6 bits of byte 1,
; top 5 bits of byte 2
%define mmx32_rgb565_b    0x000000f8
%define mmx32_rgb565_g    0x0000fc00
%define mmx32_rgb565_r    0x00f80000

; 5-5-5: combined mask for bytes 0 and 2 (top 5 bits each), and the
; mask for byte 1 (top 5 bits)
%define mmx32_rgb555_rb   0x00f800f8
%define mmx32_rgb555_g    0x0000f800

; pmaddwd factor pairs (low word, high word) applied to the *_rb masked pixel:
;   rgb555: byte 0 * 0x0008 + byte 2 * 0x2000
;   bgr555: byte 0 * 0x2000 + byte 2 * 0x0008
%define mmx32_rgb555_mul  0x20000008
%define mmx32_bgr555_mul  0x00082000
||
60 | |||
SECTION .text

;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Repacks 4-byte pixels as 3-byte pixels by dropping the top byte of
;   every source dword.
; In:    esi = source, edi = destination, ecx = pixel count
;        (register roles as used by the code below)
; Out:   esi / edi advanced past the pixels that were converted
; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        ; mm6 = 00ffffff:00ffffff (strip the top byte), mm7 = 0
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
        jz      .tail                   ; fewer than 4 pixels: scalar path only

.quad:                                  ; 4 pixels (16 bytes) in, 12 bytes out
        movq    mm0, [esi]              ; pixels 1:0
        pand    mm0, mm6                ; top byte of each pixel cleared
        movq    mm1, [esi+8]            ; pixels 3:2
        pand    mm1, mm6

        movq    mm2, mm0
        punpckhdq mm2, mm7              ; mm2 = pixel 1 alone in the low dword
        punpckldq mm0, mm7              ; mm0 = pixel 0 alone in the low dword
        psllq   mm2, 24                 ; pixel 1 moved to bytes 3..5
        por     mm0, mm2                ; bytes 0..5 = pixel 0, pixel 1

        movq    mm3, mm1
        psllq   mm3, 48                 ; first two bytes of pixel 2 -> bytes 6..7
        por     mm0, mm3                ; mm0 = first 8 output bytes

        movq    mm4, mm1
        punpckhdq mm4, mm7              ; mm4 = pixel 3 alone
        punpckldq mm1, mm7              ; mm1 = pixel 2 alone
        psrlq   mm1, 16                 ; byte 0 = third byte of pixel 2
        psllq   mm4, 8                  ; pixel 3 moved to bytes 1..3
        por     mm1, mm4                ; low dword = last 4 output bytes

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:                                ; copy three bytes, skip the fourth
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]             ; edx is free again: the count now lives in ecx
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 123 | |
124 | |||
125 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Converts 4-byte pixels to 16-bit 5-6-5 pixels: byte 2 of the source
;   supplies bits 11..15, byte 1 bits 5..10 and byte 0 bits 0..4.
; In:    esi = source, edi = destination, ecx = pixel count
;        (register roles as used by the code below)
; Out:   esi / edi advanced past the pixels that were converted
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; per-pixel component masks: mm5 = byte 0, mm6 = byte 1, mm7 = byte 2
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; byte 1 field (6 bits) at bits 10..15
        movq    mm3, mm1
        pand    mm1, mm5                ; byte 0 field (5 bits) at bits 3..7
        pand    mm3, mm7                ; byte 2 field (5 bits) at bits 19..23
        pslld   mm1, 2                  ; byte 0 field now at bits 5..9
        por     mm0, mm1                ; 11 bits at bits 5..15
        psrld   mm0, 5                  ; 11 bits at bits 0..10

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Each masked byte-2 dword is 00xx0000, i.e. words (0000, 00xx); the
        ; byte pack turns that pair into the 16-bit value xx00.
        packuswb mm3, mm1               ; 4 words, byte 2 field at bits 11..15
        packssdw mm0, mm4               ; 4 words, low 11 bits of each pixel
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:
        mov     al, [esi]               ; byte 0
        mov     bh, [esi+1]             ; byte 1
        mov     ah, [esi+2]             ; byte 2
        shr     al, 3                   ; byte 0 -> 5 bits
        and     eax, 0F81Fh             ; top 5 bits of ah, low 5 bits of al
        shr     ebx, 5
        and     ebx, 07E0h              ; top 6 bits of byte 1 at bits 5..10
        add     eax, ebx                ; the fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 192 | |
193 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Converts 4-byte pixels to 16-bit 5-6-5 pixels with the outer fields
;   exchanged relative to the RGB565 converter: byte 0 of the source
;   supplies bits 11..15, byte 1 bits 5..10 and byte 2 bits 0..4.
; In:    esi = source, edi = destination, ecx = pixel count
;        (register roles as used by the code below)
; Out:   esi / edi advanced past the pixels that were converted
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; per-pixel component masks: mm5 = byte 2, mm6 = byte 1, mm7 = byte 0
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; byte 1 field (6 bits) at bits 10..15
        movq    mm3, mm1
        pand    mm1, mm5                ; byte 2 field (5 bits) at bits 19..23
        pand    mm3, mm7                ; byte 0 field (5 bits) at bits 3..7

        psllq   mm3, 16                 ; byte 0 field moved up to bits 19..23
        psrld   mm1, 14                 ; byte 2 field moved down to bits 5..9
        por     mm0, mm1                ; 11 bits at bits 5..15
        psrld   mm0, 5                  ; 11 bits at bits 0..10

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Each shifted byte-0 dword is 00xx0000, i.e. words (0000, 00xx); the
        ; byte pack turns that pair into the 16-bit value xx00.
        packuswb mm3, mm1               ; 4 words, byte 0 field at bits 11..15
        packssdw mm0, mm4               ; 4 words, low 11 bits of each pixel
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3             ; edx itself counts the 0..3 leftovers
        jz      .done

.single:
        mov     al, [esi+2]             ; byte 2 -> low field
        mov     bh, [esi+1]             ; byte 1 -> middle field
        mov     ah, [esi]               ; byte 0 -> high field
        shr     al, 3
        and     eax, 0F81Fh             ; top 5 bits of ah, low 5 bits of al
        shr     ebx, 5
        and     ebx, 07E0h              ; top 6 bits of byte 1 at bits 5..10
        add     eax, ebx                ; the fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 262 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555
;   Entry point for the 5-5-5 converter with the two outer fields swapped.
;   The work is done by the RGB555 routine; the only difference is the
;   pmaddwd multiplier placed in mm7 before joining the shared code.
;   The 8 bytes that load_immq leaves on the stack are released by the
;   CLEANUP_IMMQ_LOADS(2) that follows _convert_bgr555_cheat.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat
271 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB555  (shared body: _convert_bgr555_cheat)
;   Converts 4-byte pixels to 16-bit 5-5-5 pixels, 8 pixels per main-loop
;   iteration plus a one-pixel-at-a-time tail for the last (count & 7).
;
;   This is the same as the Intel version.. they obviously went to
;   much more trouble to expand/coil the loop than I did, so theirs
;   would almost certainly be faster, even if only a little.
;   I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
;   (I think) a more accurate name..
;
;   Per pixel p, with mul = mm7 and g-mask = mm6:
;       out = ( pmaddwd(p & mmx32_rgb555_rb, mul) | (p & mmx32_rgb555_g) ) >> 6
;   With mmx32_rgb555_mul byte 2 lands in bits 10..14 and byte 0 in bits
;   0..4; with mmx32_bgr555_mul (loaded by _ConvertMMXpII32_16BGR555 before
;   it jumps to _convert_bgr555_cheat) the two are exchanged.
;
;   The tail uses this same formula, so it honours whichever multiplier
;   the entry point put in mm7.  The previous scalar tail hard-coded the
;   RGB555 layout and so produced swapped fields for BGR555 tail pixels.
;
; In:    esi = source, edi = destination, ecx = pixel count
;        (register roles as used by the code below)
; Out:   esi / edi advanced past the pixels that were converted
; Clobb: eax, ecx, edx, mm0-mm7, flags
;
; NOTE(review): the main loop pre-loads [esi] and [esi+8] for the next
; iteration after esi has been advanced, so its last iteration reads 16
; bytes beyond the final complete group of 8 pixels - confirm that the
; source buffer tolerates this.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g
        CLEANUP_IMMQ_LOADS(2)           ; frees the mm7 load (either entry) and the mm6 load

        mov     edx, ecx                ; keep the full count for the tail

        and     ecx, DWORD 0fffffff8h   ; ecx = count rounded down to a multiple of 8
        jnz     .prime
        jmp     near .tail              ; .tail is too far away for a short jump

.prime:
        ; Prologue of the software pipeline: start on pixels 0..3 so that the
        ; loop body can finish them while it begins on pixels 4..7.
        movq    mm2, [esi+8]            ; pixels 3:2

        movq    mm0, [esi]              ; pixels 1:0
        movq    mm3, mm2

        pand_immq mm3, mmx32_rgb555_rb
        movq    mm1, mm0

        pand_immq mm1, mmx32_rgb555_rb
        pmaddwd mm3, mm7

        CLEANUP_IMMQ_LOADS(2)

        pmaddwd mm1, mm7
        pand    mm2, mm6

.octet:
        ; The instruction order below is the original interleaving; keep it.
        movq    mm4, [esi+24]           ; pixels 7:6
        pand    mm0, mm6

        movq    mm5, [esi+16]           ; pixels 5:4
        por     mm3, mm2

        psrld   mm3, 6                  ; pixels 3:2 finished (one per dword)
        por     mm1, mm0

        movq    mm0, mm4
        psrld   mm1, 6                  ; pixels 1:0 finished

        pand_immq mm0, mmx32_rgb555_rb
        packssdw mm1, mm3               ; mm1 = output words for pixels 0..3

        movq    mm3, mm5
        pmaddwd mm0, mm7

        pand_immq mm3, mmx32_rgb555_rb
        pand    mm4, mm6

        movq    [edi], mm1
        pmaddwd mm3, mm7

        add     esi, BYTE 32
        por     mm4, mm0

        pand    mm5, mm6
        psrld   mm4, 6                  ; pixels 7:6 finished

        movq    mm2, [esi+8]            ; pre-load pixels 3:2 of the NEXT group
        por     mm5, mm3

        movq    mm0, [esi]              ; pre-load pixels 1:0 of the NEXT group
        psrld   mm5, 6                  ; pixels 5:4 finished

        movq    mm3, mm2
        movq    mm1, mm0

        pand_immq mm3, mmx32_rgb555_rb
        packssdw mm5, mm4               ; mm5 = output words for pixels 4..7

        pand_immq mm1, mmx32_rgb555_rb
        pand    mm2, mm6

        CLEANUP_IMMQ_LOADS(4)           ; four pand_immq uses per iteration

        movq    [edi+8], mm5
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        add     edi, BYTE 16

        sub     ecx, BYTE 8
        jz      .tail
        jmp     .octet


.tail:
        mov     ecx, edx

        and     ecx, BYTE 7             ; 0..7 pixels left over
        jz      .done

.single:
        ; One pixel through the same mask / pmaddwd / shift sequence as the
        ; main loop; mm6 and mm7 still hold the mask and the entry-specific
        ; multiplier.
        movd    mm0, [esi]              ; upper dword of mm0 is zero
        add     esi, BYTE 4

        movq    mm1, mm0
        pand_immq mm1, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(1)           ; done before dec ecx, so its flag update is harmless

        pmaddwd mm1, mm7                ; place the byte 0 / byte 2 fields
        pand    mm0, mm6                ; byte 1 field

        por     mm0, mm1
        psrld   mm0, 6

        movd    eax, mm0
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 402 | |
; ELF32 output only: emit an empty .note.GNU-stack section with the
; noexec attribute.
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif