Subversion Repositories Kolibri OS

Compare Revisions

No changes between revisions

Regard whitespace Rev 8209 → Rev 8210

/contrib/sdk/sources/SDL-1.2.2_newlib/src/hermes/mmxp2_32.asm
0,0 → 1,386
;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
; optimise them for p5 MMXs..
 
BITS 32
 
GLOBAL _ConvertMMXpII32_24RGB888
GLOBAL _ConvertMMXpII32_16RGB565
GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555
 
EXTERN _mmxreturn
SECTION .data
ALIGN 8
 
;; Constants for conversion routines
 
mmx32_rgb888_mask dd 00ffffffh,00ffffffh
 
mmx32_rgb565_b dd 000000f8h, 000000f8h
mmx32_rgb565_g dd 0000fc00h, 0000fc00h
mmx32_rgb565_r dd 00f80000h, 00f80000h
 
mmx32_rgb555_rb dd 00f800f8h,00f800f8h
mmx32_rgb555_g dd 0000f800h,0000f800h
mmx32_rgb555_mul dd 20000008h,20000008h
mmx32_bgr555_mul dd 00082000h,00082000h
 
 
SECTION .text
 
_ConvertMMXpII32_24RGB888:
 
; set up mm6 as the mask, mm7 as zero
movq mm6, qword [mmx32_rgb888_mask]
pxor mm7, mm7
 
mov edx, ecx ; save ecx
and ecx, 0fffffffch ; clear lower two bits
jnz .L1
jmp .L2
 
.L1:
 
movq mm0, [esi] ; A R G B a r g b
pand mm0, mm6 ; 0 R G B 0 r g b
movq mm1, [esi+8] ; A R G B a r g b
pand mm1, mm6 ; 0 R G B 0 r g b
 
movq mm2, mm0 ; 0 R G B 0 r g b
punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
psllq mm2, 24 ; 0 0 R G B 0 0 0
por mm0, mm2 ; 0 0 R G B r g b
 
movq mm3, mm1 ; 0 R G B 0 r g b
psllq mm3, 48 ; g b 0 0 0 0 0 0
por mm0, mm3 ; g b R G B r g b
 
movq mm4, mm1 ; 0 R G B 0 r g b
punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
psrlq mm1, 16 ; 0 0 0 R G B 0 r
psllq mm4, 8 ; 0 0 0 0 R G B 0
por mm1, mm4 ; 0 0 0 0 R G B r
 
movq [edi], mm0
add esi, BYTE 16
movd [edi+8], mm1
add edi, BYTE 12
sub ecx, BYTE 4
jnz .L1
 
.L2:
mov ecx, edx
and ecx, BYTE 3
jz .L4
.L3:
mov al, [esi]
mov bl, [esi+1]
mov dl, [esi+2]
mov [edi], al
mov [edi+1], bl
mov [edi+2], dl
add esi, BYTE 4
add edi, BYTE 3
dec ecx
jnz .L3
.L4:
jmp _mmxreturn
 
 
 
_ConvertMMXpII32_16RGB565:
 
; set up masks
movq mm5, [mmx32_rgb565_b]
movq mm6, [mmx32_rgb565_g]
movq mm7, [mmx32_rgb565_r]
 
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
 
.L1:
movq mm0, [esi] ; argb
movq mm1, mm0 ; argb
pand mm0, mm6 ; 00g0
movq mm3, mm1 ; argb
pand mm1, mm5 ; 000b
pand mm3, mm7 ; 0r00
pslld mm1, 2 ; 0 0 000000bb bbb00000
por mm0, mm1 ; 0 0 ggggggbb bbb00000
psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
 
movq mm4, [esi+8] ; argb
movq mm2, mm4 ; argb
pand mm4, mm6 ; 00g0
movq mm1, mm2 ; argb
pand mm2, mm5 ; 000b
pand mm1, mm7 ; 0r00
pslld mm2, 2 ; 0 0 000000bb bbb00000
por mm4, mm2 ; 0 0 ggggggbb bbb00000
psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
 
packuswb mm3, mm1 ; R 0 r 0
packssdw mm0, mm4 ; as above.. ish
por mm0, mm3 ; done.
movq [edi], mm0
 
add esi, 16
add edi, 8
dec ecx
jnz .L1
 
.L2:
mov ecx, edx
and ecx, BYTE 3
jz .L4
.L3:
mov al, [esi]
mov bh, [esi+1]
mov ah, [esi+2]
shr al, 3
and eax, 0F81Fh ; BYTE?
shr ebx, 5
and ebx, 07E0h ; BYTE?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec ecx
jnz .L3
 
.L4:
jmp _mmxreturn
 
_ConvertMMXpII32_16BGR565:
 
movq mm5, [mmx32_rgb565_r]
movq mm6, [mmx32_rgb565_g]
movq mm7, [mmx32_rgb565_b]
 
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2
 
.L1:
movq mm0, [esi] ; a r g b
movq mm1, mm0 ; a r g b
pand mm0, mm6 ; 0 0 g 0
movq mm3, mm1 ; a r g b
pand mm1, mm5 ; 0 r 0 0
pand mm3, mm7 ; 0 0 0 b
 
psllq mm3, 16 ; 0 b 0 0
psrld mm1, 14 ; 0 0 000000rr rrr00000
por mm0, mm1 ; 0 0 ggggggrr rrr00000
psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
 
movq mm4, [esi+8] ; a r g b
movq mm2, mm4 ; a r g b
pand mm4, mm6 ; 0 0 g 0
movq mm1, mm2 ; a r g b
pand mm2, mm5 ; 0 r 0 0
pand mm1, mm7 ; 0 0 0 b
 
psllq mm1, 16 ; 0 b 0 0
psrld mm2, 14 ; 0 0 000000rr rrr00000
por mm4, mm2 ; 0 0 ggggggrr rrr00000
psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
 
packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
movq [edi], mm0
 
add esi, BYTE 16
add edi, BYTE 8
dec ecx
jnz .L1
 
.L2:
and edx, BYTE 3
jz .L4
.L3:
mov al, [esi+2]
mov bh, [esi+1]
mov ah, [esi]
shr al, 3
and eax, 0F81Fh ; BYTE ?
shr ebx, 5
and ebx, 07E0h ; BYTE ?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec edx
jnz .L3
 
.L4:
jmp _mmxreturn
 
_ConvertMMXpII32_16BGR555:
 
; the 16BGR555 converter is identical to the RGB555 one,
; except it uses a different multiplier for the pmaddwd
; instruction. cool huh.
 
movq mm7, qword [mmx32_bgr555_mul]
jmp _convert_bgr555_cheat
 
; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:
 
movq mm7,qword [mmx32_rgb555_mul]
_convert_bgr555_cheat:
movq mm6,qword [mmx32_rgb555_g]
mov edx,ecx ; Save ecx
 
and ecx,BYTE 0fffffff8h ; clear lower three bits
jnz .L_OK
jmp .L2
 
.L_OK:
movq mm2,[esi+8]
 
movq mm0,[esi]
movq mm3,mm2
 
pand mm3,qword [mmx32_rgb555_rb]
movq mm1,mm0
 
pand mm1,qword [mmx32_rgb555_rb]
pmaddwd mm3,mm7
 
pmaddwd mm1,mm7
pand mm2,mm6
 
.L1:
movq mm4,[esi+24]
pand mm0,mm6
 
movq mm5,[esi+16]
por mm3,mm2
 
psrld mm3,6
por mm1,mm0
 
movq mm0,mm4
psrld mm1,6
 
pand mm0,qword [mmx32_rgb555_rb]
packssdw mm1,mm3
 
movq mm3,mm5
pmaddwd mm0,mm7
 
pand mm3,qword [mmx32_rgb555_rb]
pand mm4,mm6
 
movq [edi],mm1
pmaddwd mm3,mm7
 
add esi,BYTE 32
por mm4,mm0
 
pand mm5,mm6
psrld mm4,6
 
movq mm2,[esi+8]
por mm5,mm3
 
movq mm0,[esi]
psrld mm5,6
 
movq mm3,mm2
movq mm1,mm0
 
pand mm3,qword [mmx32_rgb555_rb]
packssdw mm5,mm4
 
pand mm1,qword [mmx32_rgb555_rb]
pand mm2,mm6
 
movq [edi+8],mm5
pmaddwd mm3,mm7
 
pmaddwd mm1,mm7
add edi,BYTE 16
sub ecx,BYTE 8
jz .L2
jmp .L1
 
 
.L2:
mov ecx,edx
and ecx,BYTE 7
jz .L4
.L3:
mov ebx,[esi]
add esi,BYTE 4
mov eax,ebx
mov edx,ebx
 
shr eax,3
shr edx,6
 
and eax,BYTE 0000000000011111b
and edx, 0000001111100000b
 
shr ebx,9
 
or eax,edx
 
and ebx, 0111110000000000b
 
or eax,ebx
 
mov [edi],ax
add edi,BYTE 2
 
dec ecx
jnz .L3
 
.L4:
jmp _mmxreturn
 
 
 
Property changes:
Added: svn:executable
+*
\ No newline at end of property