0,0 → 1,386 |
; |
; pII-optimised MMX format converters for HERMES |
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) |
; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) |
; This source code is licensed under the GNU LGPL |
; |
; Please refer to the file COPYING.LIB contained in the distribution for |
; licensing conditions |
; |
; COPYRIGHT NOTICE |
; |
; This file partly contains code that is (c) Intel Corporation, specifically |
; the mode detection routine, and the converter to 15 bit (8 pixel |
; conversion routine from the mmx programming tutorial pages). |
; |
; |
; These routines aren't exactly pII optimised - it's just that as they |
; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to |
; optimise them for p5 MMXs.. |
|
BITS 32 |
|
|
GLOBAL _ConvertMMXpII32_24RGB888 |
GLOBAL _ConvertMMXpII32_16RGB565 |
GLOBAL _ConvertMMXpII32_16BGR565 |
GLOBAL _ConvertMMXpII32_16RGB555 |
GLOBAL _ConvertMMXpII32_16BGR555 |
|
EXTERN _mmxreturn |
|
SECTION .data

ALIGN 8

;; 64-bit constants for the converters in this file.  Every constant is one
;; 32-bit pattern stored twice, so a single movq/pand applies it to both of
;; the 32-bit pixels held in an MMX register.

; 32 -> 24 bit: keeps the low three bytes of each pixel, clears the top byte
mmx32_rgb888_mask   dd 0x00ffffff, 0x00ffffff

; 32 -> 16 bit (565): one mask per byte, keeping its top 5, 6 and 5 bits
mmx32_rgb565_b      dd 0x000000f8, 0x000000f8
mmx32_rgb565_g      dd 0x0000fc00, 0x0000fc00
mmx32_rgb565_r      dd 0x00f80000, 0x00f80000

; 32 -> 16 bit (555): combined mask for bytes 0 and 2, mask for byte 1,
; and the two pmaddwd multipliers (same words, opposite order) that decide
; which of bytes 0/2 ends up in the low and which in the high 5-bit field
mmx32_rgb555_rb     dd 0x00f800f8, 0x00f800f8
mmx32_rgb555_g      dd 0x0000f800, 0x0000f800
mmx32_rgb555_mul    dd 0x20000008, 0x20000008
mmx32_bgr555_mul    dd 0x00082000, 0x00082000
|
|
|
SECTION .text |
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Packs 32-bit pixels into 24-bit pixels by copying the low three bytes
;   of each source pixel and dropping the fourth.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi/edi advanced past the converted pixels; leaves through
;        `jmp _mmxreturn`
; Uses:  al, bl, ecx, edx, mm0-mm4, mm6, mm7
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        pxor      mm7, mm7                        ; mm7 = 0, zero source for the unpacks
        movq      mm6, qword [mmx32_rgb888_mask]  ; mm6 = 00ffffff per pixel

        mov       edx, ecx                ; remember the full count for the tail
        shr       ecx, 2                  ; ecx = number of whole 4-pixel groups
        jz        .tail

.quad:
        ; Four pixels (16 bytes) in, twelve bytes out.
        movq      mm0, [esi]              ; A R G B a r g b   (pixels 1 and 0)
        movq      mm1, [esi+8]            ; A R G B a r g b   (pixels 3 and 2)
        pand      mm0, mm6                ; 0 R G B 0 r g b
        pand      mm1, mm6                ; 0 R G B 0 r g b

        ; First eight output bytes: pixel 0, pixel 1, low two bytes of pixel 2.
        movq      mm2, mm0
        movq      mm3, mm1
        punpckhdq mm2, mm7                ; 0 0 0 0 0 R G B   (pixel 1)
        punpckldq mm0, mm7                ; 0 0 0 0 0 r g b   (pixel 0)
        psllq     mm3, 48                 ; g b 0 0 0 0 0 0   (from pixel 2)
        psllq     mm2, 24                 ; 0 0 R G B 0 0 0
        por       mm0, mm3
        por       mm0, mm2                ; g b R G B r g b

        ; Last four output bytes: top byte of pixel 2 followed by pixel 3.
        movq      mm4, mm1
        punpckldq mm1, mm7                ; 0 0 0 0 0 r g b   (pixel 2)
        punpckhdq mm4, mm7                ; 0 0 0 0 0 R G B   (pixel 3)
        psrlq     mm1, 16                 ; 0 0 0 0 0 0 0 r
        psllq     mm4, 8                  ; 0 0 0 0 R G B 0
        por       mm1, mm4                ; 0 0 0 0 R G B r

        movq      [edi], mm0
        movd      [edi+8], mm1
        add       esi, BYTE 16
        add       edi, BYTE 12
        dec       ecx
        jnz       .quad

.tail:
        ; Up to three leftover pixels are copied a byte at a time.
        mov       ecx, edx
        and       ecx, BYTE 3
        jz        .done

.single:
        mov       al, [esi]
        mov       bl, [esi+1]
        mov       dl, [esi+2]
        add       esi, BYTE 4
        mov       [edi], al
        mov       [edi+1], bl
        mov       [edi+2], dl
        add       edi, BYTE 3
        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
|
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Converts 32-bit pixels to 16-bit 5:6:5 pixels: byte 2 of the source
;   fills bits 11-15, byte 1 fills bits 5-10, byte 0 fills bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi/edi advanced past the converted pixels; leaves through
;        `jmp _mmxreturn`
; Uses:  eax, ebx, ecx, edx, mm0-mm7
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        movq      mm7, [mmx32_rgb565_r]   ; 00f80000 per pixel
        movq      mm6, [mmx32_rgb565_g]   ; 0000fc00 per pixel
        movq      mm5, [mmx32_rgb565_b]   ; 000000f8 per pixel

        mov       edx, ecx                ; remember the full count for the tail
        shr       ecx, 2                  ; ecx = number of whole 4-pixel groups
        jz        .tail

.quad:
        ; pixels 0 and 1: green/blue fields assembled in mm0, red kept in mm3
        movq      mm0, [esi]              ; argb
        movq      mm1, mm0
        movq      mm3, mm0
        pand      mm0, mm6                ; 00g0
        pand      mm1, mm5                ; 000b
        pand      mm3, mm7                ; 0r00
        pslld     mm1, 2                  ; 0 0 000000bb bbb00000
        por       mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld     mm0, 5                  ; 0 0 00000ggg gggbbbbb

        ; pixels 2 and 3: green/blue fields assembled in mm4, red kept in mm1
        movq      mm4, [esi+8]            ; argb
        movq      mm2, mm4
        movq      mm1, mm4
        pand      mm4, mm6                ; 00g0
        pand      mm2, mm5                ; 000b
        pand      mm1, mm7                ; 0r00
        pslld     mm2, 2                  ; 0 0 000000bb bbb00000
        por       mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld     mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; Narrow to four 16-bit pixels.  packuswb squeezes each masked red
        ; dword (words 0, r) into bytes 0, r, which read back as the word
        ; rrrrr000 00000000 - red already sitting in bits 11-15.
        packssdw  mm0, mm4                ; four words of 00000ggg gggbbbbb
        packuswb  mm3, mm1                ; four words of rrrrr000 00000000
        por       mm0, mm3
        movq      [edi], mm0

        add       edi, 8
        add       esi, 16
        dec       ecx
        jnz       .quad

.tail:
        ; Up to three leftover pixels are converted with integer code.
        mov       ecx, edx
        and       ecx, BYTE 3
        jz        .done

.single:
        mov       bh, [esi+1]             ; green -> bits 8-15 of ebx
        mov       ah, [esi+2]             ; red   -> bits 8-15 of eax
        mov       al, [esi]               ; blue  -> bits 0-7  of eax
        add       esi, BYTE 4
        shr       ebx, 5                  ; green now occupies bits 3-10
        shr       al, 3                   ; blue now occupies bits 0-4
        and       ebx, 07E0h              ; keep the 6 green bits, drop stale bits
        and       eax, 0F81Fh             ; keep 5 red + 5 blue bits
        or        eax, ebx                ; the fields do not overlap
        mov       [edi], ax
        add       edi, BYTE 2
        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Converts 32-bit pixels to 16-bit 5:6:5 pixels with the outer fields
;   swapped relative to the RGB565 converter: byte 0 of the source fills
;   bits 11-15, byte 1 fills bits 5-10, byte 2 fills bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi/edi advanced past the converted pixels; leaves through
;        `jmp _mmxreturn`
; Uses:  eax, ebx, ecx, edx, mm0-mm7
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        movq      mm7, [mmx32_rgb565_b]   ; 000000f8 per pixel
        movq      mm6, [mmx32_rgb565_g]   ; 0000fc00 per pixel
        movq      mm5, [mmx32_rgb565_r]   ; 00f80000 per pixel

        mov       edx, ecx                ; remember the full count for the tail
        shr       ecx, 2                  ; ecx = number of whole 4-pixel groups
        jz        .tail

.quad:
        ; pixels 0 and 1: green/red fields assembled in mm0, blue kept in mm3
        movq      mm0, [esi]              ; a r g b
        movq      mm1, mm0
        movq      mm3, mm0
        pand      mm0, mm6                ; 0 0 g 0
        pand      mm1, mm5                ; 0 r 0 0
        pand      mm3, mm7                ; 0 0 0 b
        psrld     mm1, 14                 ; 0 0 000000rr rrr00000
        psllq     mm3, 16                 ; 0 b 0 0
        por       mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld     mm0, 5                  ; 0 0 00000ggg gggrrrrr

        ; pixels 2 and 3: green/red fields assembled in mm4, blue kept in mm1
        movq      mm4, [esi+8]            ; a r g b
        movq      mm2, mm4
        movq      mm1, mm4
        pand      mm4, mm6                ; 0 0 g 0
        pand      mm2, mm5                ; 0 r 0 0
        pand      mm1, mm7                ; 0 0 0 b
        psrld     mm2, 14                 ; 0 0 000000rr rrr00000
        psllq     mm1, 16                 ; 0 b 0 0
        por       mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld     mm4, 5                  ; 0 0 00000ggg gggrrrrr

        ; Narrow to four 16-bit pixels; packuswb leaves each blue byte in the
        ; high byte of its word, i.e. in bits 11-15.
        packuswb  mm3, mm1                ; four words of bbbbb000 00000000
        packssdw  mm0, mm4                ; four words of 00000ggg gggrrrrr
        por       mm0, mm3                ; four words of bbbbbggg gggrrrrr
        movq      [edi], mm0

        add       esi, BYTE 16
        add       edi, BYTE 8
        dec       ecx
        jnz       .quad

.tail:
        ; Up to three leftover pixels; edx itself serves as the counter here.
        and       edx, BYTE 3
        jz        .done

.single:
        mov       ah, [esi]               ; blue  -> bits 8-15 of eax
        mov       bh, [esi+1]             ; green -> bits 8-15 of ebx
        mov       al, [esi+2]             ; red   -> bits 0-7  of eax
        add       esi, BYTE 4
        shr       al, 3                   ; red now occupies bits 0-4
        shr       ebx, 5                  ; green now occupies bits 3-10
        and       eax, 0F81Fh             ; keep 5 blue + 5 red bits
        and       ebx, 07E0h              ; keep the 6 green bits, drop stale bits
        or        eax, ebx                ; the fields do not overlap
        mov       [edi], ax
        add       edi, BYTE 2
        dec       edx
        jnz       .single

.done:
        jmp       _mmxreturn
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;   Convert 32-bit pixels to 16-bit 5:5:5 pixels (bit 15 is written as 0).
;   Both entry points share one body; the only difference is the pmaddwd
;   multiplier loaded into mm7:
;     RGB555: byte 2 -> bits 10-14, byte 1 -> bits 5-9, byte 0 -> bits 0-4
;     BGR555: byte 0 -> bits 10-14, byte 1 -> bits 5-9, byte 2 -> bits 0-4
;
;   Per pixel:  t = pmaddwd(p & 00f800f8, mul)  ; low word * mul.lo +
;                                               ; high word * mul.hi
;               out = (t | (p & 0000f800)) >> 6
;
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi/edi advanced past the converted pixels; leaves through
;        `jmp _mmxreturn`
; Uses:  eax, ecx, edx, mm0-mm7
;
; Two deliberate differences from the earlier Intel-derived version:
;   * Leftover pixels (count mod 8) go through the same mm7-driven formula
;     as the main loop.  The old integer tail always produced RGB555
;     order, so BGR555 rows whose width was not a multiple of 8 ended with
;     red and blue swapped.
;   * The main loop only loads the 32 bytes it is about to convert.  The
;     old software-pipelined loop pre-loaded the next 16 bytes on every
;     pass, including the last one, reading past the end of the row.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq      mm7, qword [mmx32_bgr555_mul]
        jmp       _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        movq      mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq      mm6, qword [mmx32_rgb555_g]     ; 0000f800 per pixel
        movq      mm5, qword [mmx32_rgb555_rb]    ; 00f800f8 per pixel

        mov       edx, ecx                ; remember the full count for the tail
        shr       ecx, 3                  ; ecx = number of whole 8-pixel groups
        jz        .tail

.oct:
        ; ---- pixels 0-3 -> [edi] ----
        movq      mm0, [esi]              ; pixels 1,0
        movq      mm2, [esi+8]            ; pixels 3,2
        movq      mm1, mm0
        movq      mm3, mm2
        pand      mm1, mm5                ; bytes 0 and 2, top 5 bits each
        pand      mm3, mm5
        pmaddwd   mm1, mm7                ; slide both fields into place
        pmaddwd   mm3, mm7
        pand      mm0, mm6                ; byte 1, top 5 bits (bits 11-15)
        pand      mm2, mm6
        por       mm1, mm0
        por       mm3, mm2
        psrld     mm1, 6                  ; fields now at bits 0-4, 5-9, 10-14
        psrld     mm3, 6
        packssdw  mm1, mm3                ; values <= 7fffh, so no saturation
        movq      [edi], mm1

        ; ---- pixels 4-7 -> [edi+8] ----
        movq      mm0, [esi+16]           ; pixels 5,4
        movq      mm2, [esi+24]           ; pixels 7,6
        movq      mm1, mm0
        movq      mm3, mm2
        pand      mm1, mm5
        pand      mm3, mm5
        pmaddwd   mm1, mm7
        pmaddwd   mm3, mm7
        pand      mm0, mm6
        pand      mm2, mm6
        por       mm1, mm0
        por       mm3, mm2
        psrld     mm1, 6
        psrld     mm3, 6
        packssdw  mm1, mm3
        movq      [edi+8], mm1

        add       esi, BYTE 32
        add       edi, BYTE 16
        dec       ecx
        jnz       .oct

.tail:
        ; Up to seven leftover pixels, one at a time, same arithmetic as above
        ; so the channel order follows whichever multiplier is in mm7.
        mov       ecx, edx
        and       ecx, BYTE 7
        jz        .done

.single:
        movd      mm0, [esi]              ; one pixel, upper dword zeroed
        add       esi, BYTE 4
        movq      mm1, mm0
        pand      mm1, mm5
        pand      mm0, mm6
        pmaddwd   mm1, mm7
        por       mm0, mm1
        psrld     mm0, 6
        movd      eax, mm0
        mov       [edi], ax
        add       edi, BYTE 2
        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
|
|
|
Property changes: |
Added: svn:executable |
+* |
\ No newline at end of property |