Rev 6347 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5131 | clevermous | 1 | ; |
2 | ; pII-optimised MMX format converters for HERMES |
||
3 | ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) |
||
4 | ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) |
||
5 | ; This source code is licensed under the GNU LGPL |
||
6 | ; |
||
7 | ; Please refer to the file COPYING.LIB contained in the distribution for |
||
8 | ; licensing conditions |
||
9 | ; |
||
10 | ; COPYRIGHT NOTICE |
||
11 | ; |
||
12 | ; This file partly contains code that is (c) Intel Corporation, specifically |
||
13 | ; the mode detection routine, and the converter to 15 bit (8 pixel |
||
14 | ; conversion routine from the mmx programming tutorial pages). |
||
15 | ; |
||
16 | ; |
||
17 | ; These routines aren't exactly pII optimised - it's just that as they |
||
18 | ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to |
||
19 | ; optimise them for p5 MMXs.. |
||
20 | |||
21 | BITS 32 |
||
22 | |||
6386 | ashmew2 | 23 | |
5131 | clevermous | 24 | GLOBAL _ConvertMMXpII32_24RGB888 |
25 | GLOBAL _ConvertMMXpII32_16RGB565 |
||
26 | GLOBAL _ConvertMMXpII32_16BGR565 |
||
27 | GLOBAL _ConvertMMXpII32_16RGB555 |
||
28 | GLOBAL _ConvertMMXpII32_16BGR555 |
||
29 | |||
30 | EXTERN _mmxreturn |
||
31 | |||
SECTION .data

ALIGN 8

;; Constants for the conversion routines.
;; Every constant is one 32-bit pattern stored twice, so a single 64-bit
;; MMX operation applies it to two source pixels at once.

; 32 -> 24 bit: keeps the low three bytes of each pixel, clears the top one
mmx32_rgb888_mask:  times 2 dd 0x00ffffff

; 32 -> 16 bit (5-6-5): one mask per colour channel
mmx32_rgb565_b:     times 2 dd 0x000000f8
mmx32_rgb565_g:     times 2 dd 0x0000fc00
mmx32_rgb565_r:     times 2 dd 0x00f80000

; 32 -> 15 bit (5-5-5): channel masks and the two pmaddwd multipliers
mmx32_rgb555_rb:    times 2 dd 0x00f800f8
mmx32_rgb555_g:     times 2 dd 0x0000f800
mmx32_rgb555_mul:   times 2 dd 0x20000008
mmx32_bgr555_mul:   times 2 dd 0x00082000
||
48 | |||
49 | |||
50 | |||
51 | SECTION .text |
||
52 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;
; Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
; every source dword.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination (3 bytes per pixel)
;        ecx = number of pixels
; Out:   leaves through jmp _mmxreturn (defined outside this file)
; Uses:  al, bl, dl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
;
; Lane comments list bytes from most to least significant; the digit is
; the pixel index inside the current group of four.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        pxor    mm7, mm7                        ; mm7 = 0, used to zero-extend
        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = top-byte-clearing mask

        mov     edx, ecx                ; keep the full pixel count for the tail
        and     ecx, 0fffffffch         ; ecx = pixels handled four at a time
        jz      .tail

.quad:
        movq    mm0, [esi]              ; A1 R1 G1 B1 A0 R0 G0 B0
        movq    mm1, [esi+8]            ; A3 R3 G3 B3 A2 R2 G2 B2
        pand    mm0, mm6                ; 0  R1 G1 B1 0  R0 G0 B0
        pand    mm1, mm6                ; 0  R3 G3 B3 0  R2 G2 B2

        movq    mm2, mm0
        movq    mm3, mm1
        movq    mm4, mm1

        punpckldq mm0, mm7              ; 0  0  0  0  0  R0 G0 B0
        punpckhdq mm2, mm7              ; 0  0  0  0  0  R1 G1 B1
        psllq   mm3, 48                 ; G2 B2 0  0  0  0  0  0
        psllq   mm2, 24                 ; 0  0  R1 G1 B1 0  0  0
        punpckldq mm1, mm7              ; 0  0  0  0  0  R2 G2 B2
        punpckhdq mm4, mm7              ; 0  0  0  0  0  R3 G3 B3

        por     mm0, mm2                ; 0  0  R1 G1 B1 R0 G0 B0
        psrlq   mm1, 16                 ; 0  0  0  0  0  0  0  R2
        psllq   mm4, 8                  ; 0  0  0  0  R3 G3 B3 0
        por     mm0, mm3                ; G2 B2 R1 G1 B1 R0 G0 B0
        por     mm1, mm4                ; 0  0  0  0  R3 G3 B3 R2

        movq    [edi], mm0              ; output bytes 0..7
        movd    [edi+8], mm1            ; output bytes 8..11
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:
        mov     dl, [esi+2]             ; read all three bytes before storing
        mov     bl, [esi+1]
        mov     al, [esi]
        add     esi, BYTE 4
        mov     [edi+2], dl
        mov     [edi+1], bl
        mov     [edi], al
        add     edi, BYTE 3
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
||
112 | |||
113 | |||
114 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;
; Converts 32-bit pixels to 16-bit 5-6-5 pixels with the byte at source
; offset 2 in the top five bits and the byte at offset 0 in the low five.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through jmp _mmxreturn (defined outside this file)
; Uses:  eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; channel masks stay resident for the whole run
        movq    mm7, [mmx32_rgb565_r]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm5, [mmx32_rgb565_b]

        mov     edx, ecx                ; keep the full pixel count for the tail
        shr     ecx, 2                  ; ecx = number of four-pixel groups
        jz      .tail

.quad:
        movq    mm0, [esi]              ; pixels 0 and 1
        movq    mm4, [esi+8]            ; pixels 2 and 3
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0/1: green and blue end up in the low 11 bits of each dword
        pand    mm0, mm6                ; 00000000 00000000 gggggg00 00000000
        pand    mm1, mm5                ; 00000000 00000000 00000000 bbbbb000
        pand    mm3, mm7                ; 00000000 rrrrr000 00000000 00000000
        pslld   mm1, 2                  ; 00000000 00000000 000000bb bbb00000
        por     mm0, mm1                ; 00000000 00000000 ggggggbb bbb00000
        psrld   mm0, 5                  ; 00000000 00000000 00000ggg gggbbbbb

        ; pixels 2/3: same steps, red goes to mm1 this time
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; the red words are at most 00f8h, so the byte pack leaves them
        ; unsaturated and drops each one into the high byte of its 16-bit slot
        packuswb mm3, mm1               ; rrrrr000 00000000  (x4)
        packssdw mm0, mm4               ; 00000ggg gggbbbbb  (x4)
        por     mm0, mm3                ; rrrrrggg gggbbbbb  (x4)
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:
        mov     bh, [esi+1]             ; green into bits 8..15 of ebx
        mov     al, [esi]               ; blue
        mov     ah, [esi+2]             ; red
        add     esi, BYTE 4
        shr     ebx, 5                  ; green now spans bits 3..10
        shr     al, 3                   ; blue now spans bits 0..4
        and     ebx, 07E0h              ; keep the six green bits (5..10)
        and     eax, 0F81Fh             ; keep red (11..15) and blue (0..4)
        or      eax, ebx                ; the fields do not overlap
        mov     [edi], ax
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
||
180 | |||
181 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;
; Converts 32-bit pixels to 16-bit 5-6-5 pixels with the channel order
; reversed relative to _ConvertMMXpII32_16RGB565: the byte at source
; offset 0 lands in the top five bits, the byte at offset 2 in the low
; five.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through jmp _mmxreturn (defined outside this file)
; Uses:  eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; note the register roles: mm5 = red mask, mm7 = blue mask
        movq    mm7, [mmx32_rgb565_b]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm5, [mmx32_rgb565_r]

        mov     edx, ecx                ; keep the full pixel count for the tail
        shr     ecx, 2                  ; ecx = number of four-pixel groups
        jz      .tail

.quad:
        movq    mm0, [esi]              ; pixels 0 and 1
        movq    mm4, [esi+8]            ; pixels 2 and 3
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0/1
        pand    mm0, mm6                ; 00000000 00000000 gggggg00 00000000
        pand    mm1, mm5                ; 00000000 rrrrr000 00000000 00000000
        pand    mm3, mm7                ; 00000000 00000000 00000000 bbbbb000
        psrld   mm1, 14                 ; 00000000 00000000 000000rr rrr00000
        psllq   mm3, 16                 ; 00000000 bbbbb000 00000000 00000000
        por     mm0, mm1                ; 00000000 00000000 ggggggrr rrr00000
        psrld   mm0, 5                  ; 00000000 00000000 00000ggg gggrrrrr

        ; pixels 2/3: same steps, blue goes to mm1 this time
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        psrld   mm2, 14
        psllq   mm1, 16
        por     mm4, mm2
        psrld   mm4, 5

        packuswb mm3, mm1               ; bbbbb000 00000000  (x4)
        packssdw mm0, mm4               ; 00000ggg gggrrrrr  (x4)
        por     mm0, mm3                ; bbbbbggg gggrrrrr  (x4)
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3             ; edx itself counts the 0..3 leftovers
        jz      .done

.single:
        mov     bh, [esi+1]             ; green into bits 8..15 of ebx
        mov     al, [esi+2]             ; red
        mov     ah, [esi]               ; blue
        add     esi, BYTE 4
        shr     ebx, 5                  ; green now spans bits 3..10
        shr     al, 3                   ; red now spans bits 0..4
        and     ebx, 07E0h              ; keep the six green bits (5..10)
        and     eax, 0F81Fh             ; keep blue (11..15) and red (0..4)
        or      eax, ebx                ; the fields do not overlap
        mov     [edi], ax
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
||
249 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;
; Convert 32-bit pixels to 15-bit 5-5-5 pixels.  The two entry points
; share one body and differ only in the pmaddwd multiplier kept in mm7,
; which decides which of the two outer channels lands in bits 0..4 and
; which in bits 10..14.  Cool huh.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through jmp _mmxreturn (defined outside this file)
; Uses:  eax, ecx, edx, esi, edi, mm0-mm7, flags
;
; Per source dword:
;   t   = pixel & mmx32_rgb555_rb    keep the top 5 bits of bytes 0 and 2
;   t   = pmaddwd(t, mm7)            shift both fields into place and add
;   t  |= pixel & mmx32_rgb555_g     merge the top 5 bits of byte 1
;   out = t >> 6                     15-bit result in the low word
;
; The arithmetic follows the converter from the Intel MMX tutorial, but
; the loop is written straight (no preloading of the next group) so that
; it never reads past the last source pixel.  Leftover pixels go through
; the same arithmetic one at a time, so they always get the channel
; order of the entry point that was called.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]

        mov     edx, ecx                ; keep the full pixel count for the tail
        and     ecx, BYTE -8            ; ecx = pixels handled eight at a time
        jz      .tail

.octet:
        ; ---- pixels 0..3 ------------------------------------------------
        movq    mm0, [esi]              ; pixels 0 and 1
        movq    mm2, [esi+8]            ; pixels 2 and 3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, qword [mmx32_rgb555_rb]
        pand    mm3, qword [mmx32_rgb555_rb]
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3               ; mm1 = output words 0..3

        ; ---- pixels 4..7 ------------------------------------------------
        movq    mm4, [esi+16]           ; pixels 4 and 5
        movq    mm5, [esi+24]           ; pixels 6 and 7
        movq    mm0, mm4
        movq    mm2, mm5
        pand    mm0, qword [mmx32_rgb555_rb]
        pand    mm2, qword [mmx32_rgb555_rb]
        pmaddwd mm0, mm7
        pmaddwd mm2, mm7
        pand    mm4, mm6
        pand    mm5, mm6
        por     mm4, mm0
        por     mm5, mm2
        psrld   mm4, 6
        psrld   mm5, 6
        packssdw mm4, mm5               ; mm4 = output words 4..7

        ; all 32 source bytes are read before the first store
        movq    [edi], mm1
        movq    [edi+8], mm4

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     .octet

.tail:
        mov     ecx, edx
        and     ecx, BYTE 7             ; 0..7 pixels left over
        jz      .done

.single:
        movd    mm0, [esi]              ; one pixel, upper half of mm0 = 0
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm1, qword [mmx32_rgb555_rb]
        pand    mm0, mm6
        pmaddwd mm1, mm7                ; same multiplier as the main loop
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
||
384 |