Rev 9172 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
8210 | maxcodehac | 1 | ; |
2 | ; pII-optimised MMX format converters for HERMES |
||
3 | ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) |
||
4 | ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) |
||
5 | ; This source code is licensed under the GNU LGPL |
||
6 | ; |
||
7 | ; Please refer to the file COPYING.LIB contained in the distribution for |
||
8 | ; licensing conditions |
||
9 | ; |
||
10 | ; COPYRIGHT NOTICE |
||
11 | ; |
||
12 | ; This file partly contains code that is (c) Intel Corporation, specifically |
||
13 | ; the mode detection routine, and the converter to 15 bit (8 pixel |
||
14 | ; conversion routine from the mmx programming tutorial pages). |
||
15 | ; |
||
16 | ; |
||
17 | ; These routines aren't exactly pII optimised - it's just that as they |
||
18 | ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to |
||
19 | ; optimise them for p5 MMXs.. |
||
20 | |||
21 | BITS 32 |
||
22 | |||
9172 | turbocat | 23 | %include "common.inc" |
8210 | maxcodehac | 24 | |
; Entry-point declarations.  Every converter is declared under two
; spellings: with a leading underscore and without.
; NOTE(review): SDL_FUNC is not defined in this file - presumably it is a
; macro from common.inc that exports the symbol; confirm against common.inc.

; underscore-prefixed names
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555

; undecorated names
SDL_FUNC ConvertMMXpII32_24RGB888
SDL_FUNC ConvertMMXpII32_16RGB565
SDL_FUNC ConvertMMXpII32_16BGR565
SDL_FUNC ConvertMMXpII32_16RGB555
SDL_FUNC ConvertMMXpII32_16BGR555
||
36 | |||
37 | |||
9172 | turbocat | 38 | ;; Macros for conversion routines |
8210 | maxcodehac | 39 | |
; _push_immq_mask imm32
; Builds a 64-bit constant on the stack by pushing the same 32-bit
; immediate twice, so [esp] holds imm32:imm32.
; Moves esp down by 8 bytes; nothing here pops them again.
%macro _push_immq_mask 1
    %rep 2
        push    dword %1
    %endrep
%endmacro
||
8210 | maxcodehac | 44 | |
; load_immq mmreg, imm32
; Loads mmreg with the 64-bit value imm32:imm32.
;   %1 = destination MMX register
;   %2 = 32-bit immediate replicated into both dwords
; The 8 bytes pushed to build the constant stay on the stack; the user
; of this macro has to release them (see the `add esp` uses in this file).
%macro load_immq 2
        _push_immq_mask %2              ; [esp] = imm32:imm32
        movq    %1, [esp]
%endmacro
||
8210 | maxcodehac | 49 | |
; pand_immq mmreg, imm32
; ANDs mmreg with the 64-bit value imm32:imm32.
;   %1 = MMX register that is masked in place
;   %2 = 32-bit immediate replicated into both dwords
; Like load_immq, this leaves 8 bytes on the stack that must be released
; afterwards.
%macro pand_immq 2
        _push_immq_mask %2              ; [esp] = imm32:imm32
        pand    %1, [esp]
%endmacro
||
8210 | maxcodehac | 54 | |
; CLEANUP_IMMQ_LOADS(num)
; Releases the stack space left behind by `num` load_immq / pand_immq
; expansions (8 bytes each) with a single `add esp`.
;  - The argument is parenthesised so that an expression such as (1+1)
;    releases 16 bytes rather than 8*1+1.
;  - `byte` requests the 8-bit immediate form, so 8*num has to fit in a
;    signed byte (num <= 15).
;  - `add` rewrites the flags: do not place this between a flag-setting
;    instruction and the branch that consumes it.
%define CLEANUP_IMMQ_LOADS(num) \
        add esp, byte 8 * (num)
||
8210 | maxcodehac | 57 | |
; 32-bit immediates; load_immq / pand_immq replicate them into both
; halves of an MMX register, i.e. one copy per source pixel.

; 32 -> 24 bit: keep the three low bytes of each pixel
%define mmx32_rgb888_mask 0x00ffffff

; 32 -> 16 bit (5-6-5): per-channel bit selections
%define mmx32_rgb565_b    0x000000f8    ; top 5 bits of byte 0
%define mmx32_rgb565_g    0x0000fc00    ; top 6 bits of byte 1
%define mmx32_rgb565_r    0x00f80000    ; top 5 bits of byte 2

; 32 -> 15 bit (5-5-5)
%define mmx32_rgb555_rb   0x00f800f8    ; top 5 bits of bytes 0 and 2
%define mmx32_rgb555_g    0x0000f800    ; top 5 bits of byte 1
; pmaddwd multipliers (one 16-bit factor per word of the masked pixel):
%define mmx32_rgb555_mul  0x20000008    ; low word * 0x0008, high word * 0x2000
%define mmx32_bgr555_mul  0x00082000    ; low word * 0x2000, high word * 0x0008
||
67 | |||
8210 | maxcodehac | 68 | SECTION .text |
69 | |||
;-----------------------------------------------------------------------
; ConvertMMXpII32_24RGB888 / _ConvertMMXpII32_24RGB888
;
; Packs 4-byte pixels into 3-byte pixels by dropping the top byte of
; every source dword.
;
; In:    esi = source      (4 bytes per pixel)
;        edi = destination (3 bytes per pixel)
;        ecx = number of pixels
; Out:   esi / edi point just past the data that was read / written
; Clobb: ecx, edx, mm6, mm7, flags always;
;        mm0-mm4 when at least four pixels are converted;
;        al, bl, dl when count is not a multiple of four
; Note:  no EMMS is issued in this routine.
;
; Lane comments list the eight bytes of a register from most to least
; significant.  Lower case = pixel in the low dword, upper case = pixel
; in the high dword; a prime marks the second quadword ([esi+8]).
;-----------------------------------------------------------------------
ConvertMMXpII32_24RGB888:
_ConvertMMXpII32_24RGB888:

        load_immq mm6, mmx32_rgb888_mask ; mm6 = 00ffffff:00ffffff
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7                ; mm7 = 0, used by the unpacks

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, 0fffffffch         ; ecx = count rounded down to 4
        jz      .tail

        ; ---- four pixels (16 bytes in, 12 bytes out) per pass ----
.quad:
        movq    mm0, [esi]              ; A  R  G  B  a  r  g  b
        pand    mm0, mm6                ; 0  R  G  B  0  r  g  b
        movq    mm1, [esi+8]            ; A' R' G' B' a' r' g' b'
        pand    mm1, mm6                ; 0  R' G' B' 0  r' g' b'

        movq    mm2, mm0                ; 0  R  G  B  0  r  g  b
        punpckhdq mm2, mm7              ; 0  0  0  0  0  R  G  B
        punpckldq mm0, mm7              ; 0  0  0  0  0  r  g  b
        psllq   mm2, 24                 ; 0  0  R  G  B  0  0  0
        por     mm0, mm2                ; 0  0  R  G  B  r  g  b

        movq    mm3, mm1                ; 0  R' G' B' 0  r' g' b'
        psllq   mm3, 48                 ; g' b' 0  0  0  0  0  0
        por     mm0, mm3                ; g' b' R  G  B  r  g  b

        movq    mm4, mm1                ; 0  R' G' B' 0  r' g' b'
        punpckhdq mm4, mm7              ; 0  0  0  0  0  R' G' B'
        punpckldq mm1, mm7              ; 0  0  0  0  0  r' g' b'
        psrlq   mm1, 16                 ; 0  0  0  0  0  0  0  r'
        psllq   mm4, 8                  ; 0  0  0  0  R' G' B' 0
        por     mm1, mm4                ; 0  0  0  0  R' G' B' r'

        movq    [edi], mm0              ; output bytes 0..7
        add     esi, BYTE 16
        movd    [edi+8], mm1            ; output bytes 8..11
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

        ; ---- remaining count & 3 pixels, one at a time ----
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done
.single:
        mov     al, [esi]               ; read the three low bytes ...
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al               ; ... and store them back to back
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single
.done:
        retn
8210 | maxcodehac | 131 | |
132 | |||
;-----------------------------------------------------------------------
; ConvertMMXpII32_16RGB565 / _ConvertMMXpII32_16RGB565
;
; Converts 4-byte pixels to 16-bit 5-6-5 pixels: source byte 2 supplies
; bits 11-15, byte 1 bits 5-10 and byte 0 bits 0-4 of each output word.
;
; In:    esi = source      (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Out:   esi / edi point just past the data that was read / written
; Clobb: ecx, edx, mm5-mm7, flags always;
;        mm0-mm4 when at least four pixels are converted;
;        eax, ebx when count is not a multiple of four
; Note:  no EMMS is issued in this routine.
;-----------------------------------------------------------------------
ConvertMMXpII32_16RGB565:
_ConvertMMXpII32_16RGB565:

        ; per-channel masks, one copy per pixel of a quadword
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

        ; ---- four pixels (16 bytes in, 8 bytes out) per pass ----
.quad:
        ; pixels 0 and 1: green+blue end up in mm0, red stays in mm3
        movq    mm0, [esi]              ; a r g b   (per dword)
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 0 0 b
        pand    mm3, mm7                ; 0 r 0 0
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        ; pixels 2 and 3: green+blue end up in mm4, red stays in mm1
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 0 0 b
        pand    mm1, mm7                ; 0 r 0 0
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; squeeze the four dwords into four words and merge
        packuswb mm3, mm1               ; rrrrr000 00000000 for each pixel
        packssdw mm0, mm4               ; 00000ggg gggbbbbb for each pixel
        por     mm0, mm3                ; rrrrrggg gggbbbbb for each pixel
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

        ; ---- remaining count & 3 pixels, one at a time ----
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done
.single:
        mov     al, [esi]               ; byte 0 -> al
        mov     bh, [esi+1]             ; byte 1 -> bits 8-15 of ebx
        mov     ah, [esi+2]             ; byte 2 -> ah
        shr     al, 3                   ; byte 0: top five bits -> bits 0-4
        and     eax, 0F81Fh             ; keep bits 11-15 and 0-4 only
        shr     ebx, 5                  ; byte 1 -> bits 3-10
        and     ebx, 07E0h              ; keep its top six bits (5-10);
                                        ; stale bits of ebx are masked off
        add     eax, ebx                ; fields are disjoint: add acts as or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 200 | |
;-----------------------------------------------------------------------
; ConvertMMXpII32_16BGR565 / _ConvertMMXpII32_16BGR565
;
; Converts 4-byte pixels to 16-bit 5-6-5 pixels with the outer channels
; swapped: source byte 0 supplies bits 11-15, byte 1 bits 5-10 and
; byte 2 bits 0-4 of each output word.
;
; In:    esi = source      (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Out:   esi / edi point just past the data that was read / written
; Clobb: ecx, edx, mm5-mm7, flags always;
;        mm0-mm4 when at least four pixels are converted;
;        eax, ebx when count is not a multiple of four
; Note:  no EMMS is issued in this routine.
;-----------------------------------------------------------------------
ConvertMMXpII32_16BGR565:
_ConvertMMXpII32_16BGR565:

        ; per-channel masks, one copy per pixel of a quadword
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

        ; ---- four pixels (16 bytes in, 8 bytes out) per pass ----
.quad:
        ; pixels 0 and 1: green+red end up in mm0, blue stays in mm3
        movq    mm0, [esi]              ; a r g b   (per dword)
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        ; pixels 2 and 3: green+red end up in mm4, blue stays in mm1
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        ; squeeze the four dwords into four words and merge
        packuswb mm3, mm1               ; bbbbb000 00000000 for each pixel
        packssdw mm0, mm4               ; 00000ggg gggrrrrr for each pixel
        por     mm0, mm3                ; bbbbbggg gggrrrrr for each pixel
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

        ; ---- remaining count & 3 pixels, counted down in edx ----
.tail:
        and     edx, BYTE 3
        jz      .done
.single:
        mov     al, [esi+2]             ; byte 2 -> al
        mov     bh, [esi+1]             ; byte 1 -> bits 8-15 of ebx
        mov     ah, [esi]               ; byte 0 -> ah
        shr     al, 3                   ; byte 2: top five bits -> bits 0-4
        and     eax, 0F81Fh             ; keep bits 11-15 and 0-4 only
        shr     ebx, 5                  ; byte 1 -> bits 3-10
        and     ebx, 07E0h              ; keep its top six bits (5-10);
                                        ; stale bits of ebx are masked off
        add     eax, ebx                ; fields are disjoint: add acts as or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 270 | |
;-----------------------------------------------------------------------
; ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16BGR555
; ConvertMMXpII32_16RGB555 / _ConvertMMXpII32_16RGB555
;
; Convert 4-byte pixels to 15-bit 5-5-5 pixels.  Both entry points share
; one body; the only difference is the pmaddwd multiplier held in mm7,
; which decides whether source byte 0 or source byte 2 lands in the low
; five bits of the output word:
;   RGB555: byte 2 -> bits 10-14, byte 1 -> bits 5-9, byte 0 -> bits 0-4
;   BGR555: byte 0 -> bits 10-14, byte 1 -> bits 5-9, byte 2 -> bits 0-4
;
; How one pixel is built:
;   (pixel & 00f800f8h) pmaddwd mm7  -> the two outer channels, shifted
;   (pixel & 0000f800h)              -> the middle channel
;   or them together, shift right by 6, keep the low word.
;
; In:    esi = source      (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = number of pixels
; Out:   esi / edi point just past the data that was converted / written
; Clobb: ecx, edx, mm6, mm7, flags always;
;        mm0-mm5 when at least eight pixels are converted;
;        eax, mm0, mm1, mm5 when count is not a multiple of eight
; Note:  no EMMS is issued in this routine.
;
; The leftover-pixel loop goes through the same mm6/mm7 constants as the
; bulk loop.  A fixed scalar RGB555 tail would give BGR555 callers
; swapped outer channels for the last count & 7 pixels.
;
; NOTE(review): the bulk loop is software-pipelined - after `add esi, 32`
; it already loads [esi] and [esi+8] for the next pass.  On the final
; pass that reads 16 bytes beyond the pixels converted by the loop, i.e.
; beyond the end of the source when fewer than four leftover pixels
; follow.  Confirm the source buffers tolerate this.
;-----------------------------------------------------------------------
ConvertMMXpII32_16BGR555:
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul ; released together with the mm6
                                        ; load in the shared body below
        jmp     _convert_bgr555_cheat

; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..

ConvertMMXpII32_16RGB555:
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g
        CLEANUP_IMMQ_LOADS(2)           ; mm7 + mm6 loads

        mov     edx, ecx                ; keep the full count for the tail

        and     ecx, DWORD 0fffffff8h   ; ecx = count rounded down to 8
        jz      near .tail

        ; ---- pipeline warm-up: start pixels 0-3 of the first group ----
        movq    mm2, [esi+8]            ; pixels 2,3

        movq    mm0, [esi]              ; pixels 0,1
        movq    mm3, mm2

        pand_immq mm3, mmx32_rgb555_rb  ; outer channels of pixels 2,3
        movq    mm1, mm0

        pand_immq mm1, mmx32_rgb555_rb  ; outer channels of pixels 0,1
        pmaddwd mm3, mm7

        CLEANUP_IMMQ_LOADS(2)

        pmaddwd mm1, mm7
        pand    mm2, mm6                ; middle channel of pixels 2,3

        ; ---- eight pixels (32 bytes in, 16 bytes out) per pass ----
        ; On entry: mm1 / mm3 = multiplied outer channels of pixels 0,1 /
        ; 2,3; mm0 = raw pixels 0,1; mm2 = masked middle of pixels 2,3.
.bulk:
        movq    mm4, [esi+24]           ; pixels 6,7
        pand    mm0, mm6                ; middle channel of pixels 0,1

        movq    mm5, [esi+16]           ; pixels 4,5
        por     mm3, mm2                ; pixels 2,3 assembled

        psrld   mm3, 6
        por     mm1, mm0                ; pixels 0,1 assembled

        movq    mm0, mm4
        psrld   mm1, 6

        pand_immq mm0, mmx32_rgb555_rb  ; outer channels of pixels 6,7
        packssdw mm1, mm3               ; pixels 0-3 as four words

        movq    mm3, mm5
        pmaddwd mm0, mm7

        pand_immq mm3, mmx32_rgb555_rb  ; outer channels of pixels 4,5
        pand    mm4, mm6                ; middle channel of pixels 6,7

        movq    [edi], mm1              ; store pixels 0-3
        pmaddwd mm3, mm7

        add     esi, BYTE 32
        por     mm4, mm0                ; pixels 6,7 assembled

        pand    mm5, mm6                ; middle channel of pixels 4,5
        psrld   mm4, 6

        movq    mm2, [esi+8]            ; read ahead: next pixels 2,3
        por     mm5, mm3                ; pixels 4,5 assembled

        movq    mm0, [esi]              ; read ahead: next pixels 0,1
        psrld   mm5, 6

        movq    mm3, mm2
        movq    mm1, mm0

        pand_immq mm3, mmx32_rgb555_rb
        packssdw mm5, mm4               ; pixels 4-7 as four words

        pand_immq mm1, mmx32_rgb555_rb
        pand    mm2, mm6

        CLEANUP_IMMQ_LOADS(4)           ; four pand_immq per pass; must stay
                                        ; ahead of the flag-setting sub below

        movq    [edi+8], mm5            ; store pixels 4-7
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        add     edi, BYTE 16

        sub     ecx, BYTE 8
        jnz     .bulk

        ; ---- remaining count & 7 pixels, one at a time ----
.tail:
        mov     ecx, edx

        and     ecx, BYTE 7
        jz      .done

        ; mm0-mm5 are free here; keep the outer-channel mask in mm5.
        ; mm6 (middle-channel mask) and mm7 (multiplier) are still intact.
        load_immq mm5, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(1)

.single:
        movd    mm0, [esi]              ; one pixel, upper dword zeroed
        add     esi, BYTE 4

        movq    mm1, mm0
        pand    mm1, mm5                ; outer channels
        pmaddwd mm1, mm7                ; placed according to the entry point
        pand    mm0, mm6                ; middle channel
        por     mm0, mm1
        psrld   mm0, 6                  ; 15-bit pixel in the low word

        movd    eax, mm0
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .single

.done:
        retn
8210 | maxcodehac | 413 | |
; ELF32 builds only: emit an empty .note.GNU-stack section.  By GNU
; toolchain convention this tells the linker that the object does not
; need an executable stack.
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif