Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 292 → Rev 293

/kernel/trunk/drivers/infinity.asm
19,18 → 19,22
include 'main.inc'
include 'imports.inc'
 
USE_MMX equ 0
USE_MMX_128 equ 0
USE_SSE equ 0
FORCE_MMX equ 0 ;set to 1 to force use mmx or
FORCE_MMX_128 equ 0 ;integer sse2 extensions
;and reduce driver size
;USE_SSE equ 0
 
DEBUG equ 1
 
EVENT_NOTIFY equ 0x00000200
 
OS_BASE equ 0; 0x80400000
new_app_base equ 0x60400000; 0x01000000
OS_BASE equ 0
new_app_base equ 0x60400000
PROC_BASE equ OS_BASE+0x0080000
 
CAPS_SSE2 equ 26
 
 
public START
public service_proc
public version
79,6 → 83,42
mov [str.fd], eax
mov [str.bk], eax
 
if FORCE_MMX
if FORCE_MMX_128
display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
stop
end if
mov [mix_2_core], mmx_mix_2
mov [mix_3_core], mmx_mix_3
mov [mix_4_core], mmx_mix_4
end if
 
if FORCE_MMX_128
if FORCE_MMX
display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
stop
end if
mov [mix_2_core], mmx128_mix_2
mov [mix_3_core], mmx128_mix_3
mov [mix_4_core], mmx128_mix_4
end if
 
if ~(FORCE_MMX or FORCE_MMX_128) ;autodetect
mov eax, 1
cpuid
bt edx, CAPS_SSE2
jc .mmx128
;old 64-bit mmx
mov [mix_2_core], mmx_mix_2
mov [mix_3_core], mmx_mix_3
mov [mix_4_core], mmx_mix_4
jmp @F
.mmx128: ;new 128-bit sse2 extensions
mov [mix_2_core], mmx128_mix_2
mov [mix_3_core], mmx128_mix_3
mov [mix_4_core], mmx128_mix_4
@@:
end if
stdcall set_handler, [hSound], new_mix
stdcall RegService, szInfinity, service_proc
ret
563,14 → 603,8
endp
 
include 'mixer.asm'
 
;if USE_MMX
; include 'mix_mmx.inc'
;end if
 
if USE_MMX_128
include 'mix_mmx.inc'
include 'mix_sse2.inc'
end if
 
;if USE_SSE
; include 'mix_sse.inc'
664,7 → 698,7
str.fd rd 1
str.bk rd 1
 
mix_2_1.core rd 1
mix_3_1.core rd 1
mix_4_1.core rd 1
mix_2_core rd 1
mix_3_core rd 1
mix_4_core rd 1
 
/kernel/trunk/drivers/mix_mmx.inc
0,0 → 1,241
 
; params
; edi= output
; eax= input stream 1
; ebx= input stream 2
 
if used mmx_mix_2
 
;-----------------------------------------------------------------------
; mmx_mix_2 - mix two 128-byte blocks of packed 16-bit words with
;             signed saturation (paddsw) using 64-bit MMX registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
; Out:   [edi .. edi+127] = stream 1 + stream 2 (per-word, saturated)
; Clobb: mm0-mm7 (no emms is executed here; general registers are
;        left untouched)
;-----------------------------------------------------------------------
align 4
mmx_mix_2:
        ; ---- first 64 bytes: load stream 1 ----
        movq    mm0, [eax]
        movq    mm1, [eax+8]
        movq    mm2, [eax+16]
        movq    mm3, [eax+24]
        movq    mm4, [eax+32]
        movq    mm5, [eax+40]
        movq    mm6, [eax+48]
        movq    mm7, [eax+56]

        ; add stream 2 and store each quadword
        paddsw  mm0, [ebx]
        movq    [edi], mm0
        paddsw  mm1, [ebx+8]
        movq    [edi+8], mm1
        paddsw  mm2, [ebx+16]
        movq    [edi+16], mm2
        paddsw  mm3, [ebx+24]
        movq    [edi+24], mm3
        paddsw  mm4, [ebx+32]
        movq    [edi+32], mm4
        paddsw  mm5, [ebx+40]
        movq    [edi+40], mm5
        paddsw  mm6, [ebx+48]
        movq    [edi+48], mm6
        paddsw  mm7, [ebx+56]
        movq    [edi+56], mm7

        ; ---- second 64 bytes: load stream 1 ----
        movq    mm0, [eax+64]
        movq    mm1, [eax+72]
        movq    mm2, [eax+80]
        movq    mm3, [eax+88]
        movq    mm4, [eax+96]
        movq    mm5, [eax+104]
        movq    mm6, [eax+112]
        movq    mm7, [eax+120]

        ; add stream 2 and store each quadword
        paddsw  mm0, [ebx+64]
        movq    [edi+64], mm0
        paddsw  mm1, [ebx+72]
        movq    [edi+72], mm1
        paddsw  mm2, [ebx+80]
        movq    [edi+80], mm2
        paddsw  mm3, [ebx+88]
        movq    [edi+88], mm3
        paddsw  mm4, [ebx+96]
        movq    [edi+96], mm4
        ; fixed: this pair used [ecx+104] / [edx+104]; ecx and edx are
        ; not parameters of this routine, so it read and wrote through
        ; unrelated pointers and left output quadword +104 unwritten
        paddsw  mm5, [ebx+104]
        movq    [edi+104], mm5
        paddsw  mm6, [ebx+112]
        movq    [edi+112], mm6
        paddsw  mm7, [ebx+120]
        movq    [edi+120], mm7
        ret
 
;-----------------------------------------------------------------------
; mmx_mix_3 - mix three 128-byte blocks of packed 16-bit words with
;             signed saturation (paddsw) using 64-bit MMX registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
; Out:   [edi .. edi+127] = (stream 1 + stream 2) + stream 3
; Clobb: mm0-mm7 (no emms is executed here)
;-----------------------------------------------------------------------

; One 64-byte half of the mix: load eight quadwords of stream 1, add
; stream 2 to all of them, then add stream 3 and store one by one.
; offs = byte offset of the half inside the 128-byte block.
macro mmx_mix_3_half offs
{
        movq    mm0, [eax+offs]
        movq    mm1, [eax+offs+8]
        movq    mm2, [eax+offs+16]
        movq    mm3, [eax+offs+24]
        movq    mm4, [eax+offs+32]
        movq    mm5, [eax+offs+40]
        movq    mm6, [eax+offs+48]
        movq    mm7, [eax+offs+56]

        paddsw  mm0, [ebx+offs]
        paddsw  mm1, [ebx+offs+8]
        paddsw  mm2, [ebx+offs+16]
        paddsw  mm3, [ebx+offs+24]
        paddsw  mm4, [ebx+offs+32]
        paddsw  mm5, [ebx+offs+40]
        paddsw  mm6, [ebx+offs+48]
        paddsw  mm7, [ebx+offs+56]

        paddsw  mm0, [ecx+offs]
        movq    [edi+offs], mm0
        paddsw  mm1, [ecx+offs+8]
        movq    [edi+offs+8], mm1
        paddsw  mm2, [ecx+offs+16]
        movq    [edi+offs+16], mm2
        paddsw  mm3, [ecx+offs+24]
        movq    [edi+offs+24], mm3
        paddsw  mm4, [ecx+offs+32]
        movq    [edi+offs+32], mm4
        paddsw  mm5, [ecx+offs+40]
        movq    [edi+offs+40], mm5
        paddsw  mm6, [ecx+offs+48]
        movq    [edi+offs+48], mm6
        paddsw  mm7, [ecx+offs+56]
        movq    [edi+offs+56], mm7
}

align 4
mmx_mix_3:
        mmx_mix_3_half 0                ; bytes   0..63
        mmx_mix_3_half 64               ; bytes  64..127
        ret
 
;-----------------------------------------------------------------------
; mmx_mix_4 - mix four 128-byte blocks of packed 16-bit words with
;             signed saturation (paddsw) using 64-bit MMX registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edx = input stream 4
; Out:   [edi .. edi+127] = (stream 1 + stream 3) + (stream 2 + stream 4)
; Clobb: mm0-mm7 (no emms is executed here)
;
; Each 32-byte group keeps the stream 1+3 partial sums in the even
; registers (mm0/2/4/6) and the stream 2+4 partial sums in the odd
; registers (mm1/3/5/7); the final add always pairs mmN with mmN+1.
;-----------------------------------------------------------------------
align 4
mmx_mix_4:

        ; ---- bytes 0..31 ----
        movq    mm0, [eax]
        movq    mm2, [eax+8]
        movq    mm4, [eax+16]
        movq    mm6, [eax+24]
        movq    mm1, [ebx]
        movq    mm3, [ebx+8]
        movq    mm5, [ebx+16]
        movq    mm7, [ebx+24]
        paddsw  mm0, [ecx]
        paddsw  mm2, [ecx+8]
        paddsw  mm4, [ecx+16]
        paddsw  mm6, [ecx+24]
        paddsw  mm1, [edx]
        paddsw  mm3, [edx+8]
        paddsw  mm5, [edx+16]
        paddsw  mm7, [edx+24]

        paddsw  mm0, mm1
        movq    [edi], mm0
        paddsw  mm2, mm3
        movq    [edi+8], mm2
        paddsw  mm4, mm5
        movq    [edi+16], mm4
        paddsw  mm6, mm7                ; fixed: was "paddsw mm5, mm6", which
        movq    [edi+24], mm6           ; dropped streams 2 and 4 from +24

        ; ---- bytes 32..63 ----
        movq    mm0, [eax+32]
        movq    mm2, [eax+40]
        movq    mm4, [eax+48]
        movq    mm6, [eax+56]
        movq    mm1, [ebx+32]
        movq    mm3, [ebx+40]
        movq    mm5, [ebx+48]
        movq    mm7, [ebx+56]
        paddsw  mm0, [ecx+32]
        paddsw  mm2, [ecx+40]
        paddsw  mm4, [ecx+48]
        paddsw  mm6, [ecx+56]
        paddsw  mm1, [edx+32]
        paddsw  mm3, [edx+40]
        paddsw  mm5, [edx+48]
        paddsw  mm7, [edx+56]

        paddsw  mm0, mm1
        movq    [edi+32], mm0
        paddsw  mm2, mm3                ; fixed: was "paddsw mm2, mm2", which
        movq    [edi+40], mm2           ; doubled streams 1+3 and lost 2+4
        paddsw  mm4, mm5
        movq    [edi+48], mm4
        paddsw  mm6, mm7
        movq    [edi+56], mm6

        ; ---- bytes 64..95 ----
        movq    mm0, [eax+64]
        movq    mm2, [eax+72]
        movq    mm4, [eax+80]
        movq    mm6, [eax+88]
        movq    mm1, [ebx+64]
        movq    mm3, [ebx+72]
        movq    mm5, [ebx+80]
        movq    mm7, [ebx+88]
        paddsw  mm0, [ecx+64]
        paddsw  mm2, [ecx+72]
        paddsw  mm4, [ecx+80]
        paddsw  mm6, [ecx+88]
        paddsw  mm1, [edx+64]
        paddsw  mm3, [edx+72]
        paddsw  mm5, [edx+80]
        paddsw  mm7, [edx+88]

        paddsw  mm0, mm1
        movq    [edi+64], mm0
        paddsw  mm2, mm3
        movq    [edi+72], mm2
        paddsw  mm4, mm5
        movq    [edi+80], mm4
        paddsw  mm6, mm7                ; fixed: was "paddsw mm6, mm5" and the
        movq    [edi+88], mm6           ; store wrote mm7 (streams 2+4 only)

        ; ---- bytes 96..127 ----
        movq    mm0, [eax+96]
        movq    mm2, [eax+104]
        movq    mm4, [eax+112]
        movq    mm6, [eax+120]
        movq    mm1, [ebx+96]
        movq    mm3, [ebx+104]
        movq    mm5, [ebx+112]
        movq    mm7, [ebx+120]
        paddsw  mm0, [ecx+96]
        paddsw  mm2, [ecx+104]
        paddsw  mm4, [ecx+112]
        paddsw  mm6, [ecx+120]
        paddsw  mm1, [edx+96]
        paddsw  mm3, [edx+104]
        paddsw  mm5, [edx+112]
        paddsw  mm7, [edx+120]

        paddsw  mm0, mm1
        movq    [edi+96], mm0           ; fixed: was stored to [eax+96], i.e.
                                        ; into input stream 1, not the output
        paddsw  mm2, mm3
        movq    [edi+104], mm2
        paddsw  mm4, mm5
        movq    [edi+112], mm4
        paddsw  mm6, mm7
        movq    [edi+120], mm6
        ret
 
end if
/kernel/trunk/drivers/mix_sse2.inc
0,0 → 1,139
 
if used mmx128_mix_2
 
;-----------------------------------------------------------------------
; mmx128_mix_2 - mix two 128-byte blocks of packed 16-bit words with
;                signed saturation (paddsw) using 128-bit XMM registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
; Out:   [edi .. edi+127] = stream 1 + stream 2 (per-word, saturated)
; Clobb: xmm0-xmm7
; Note:  movaps and the paddsw memory operands require 16-byte aligned
;        addresses; the callers are assumed to guarantee that for all
;        three pointers - TODO confirm against the buffer allocation.
;-----------------------------------------------------------------------

; Add 16 bytes of stream 2 to an already loaded register and store the
; result at the same offset of the output block.
macro m128_mix2_step reg, offs
{
        paddsw  reg, [ebx+offs]
        movaps  [edi+offs], reg
}

align 4
mmx128_mix_2:
        ; prefetch hints for the data 128 bytes ahead of both inputs
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]

        ; load the whole 128-byte block of stream 1
        movaps  xmm0, [eax+0]
        movaps  xmm1, [eax+16]
        movaps  xmm2, [eax+32]
        movaps  xmm3, [eax+48]
        movaps  xmm4, [eax+64]
        movaps  xmm5, [eax+80]
        movaps  xmm6, [eax+96]
        movaps  xmm7, [eax+112]

        ; mix in stream 2 and write the output
        m128_mix2_step xmm0, 0
        m128_mix2_step xmm1, 16
        m128_mix2_step xmm2, 32
        m128_mix2_step xmm3, 48
        m128_mix2_step xmm4, 64
        m128_mix2_step xmm5, 80
        m128_mix2_step xmm6, 96
        m128_mix2_step xmm7, 112
        ret
 
;-----------------------------------------------------------------------
; mmx128_mix_3 - mix three 128-byte blocks of packed 16-bit words with
;                signed saturation (paddsw) using 128-bit XMM registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
; Out:   [edi .. edi+127] = (stream 1 + stream 2) + stream 3
; Clobb: xmm0-xmm7
; Note:  movaps and the paddsw memory operands require 16-byte aligned
;        addresses; the callers are assumed to guarantee that for all
;        four pointers - TODO confirm against the buffer allocation.
;-----------------------------------------------------------------------

; Add 16 bytes of stream 3 to a register that already holds the
; stream 1 + stream 2 sum and store the result to the output block.
macro m128_mix3_step reg, offs
{
        paddsw  reg, [ecx+offs]
        movaps  [edi+offs], reg
}

align 4
mmx128_mix_3:
        ; prefetch hints for the data 128 bytes ahead of every input
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]
        prefetcht1 [ecx+128]

        ; load the whole 128-byte block of stream 1
        movaps  xmm0, [eax+0]
        movaps  xmm1, [eax+16]
        movaps  xmm2, [eax+32]
        movaps  xmm3, [eax+48]
        movaps  xmm4, [eax+64]
        movaps  xmm5, [eax+80]
        movaps  xmm6, [eax+96]
        movaps  xmm7, [eax+112]

        ; add stream 2
        paddsw  xmm0, [ebx+0]
        paddsw  xmm1, [ebx+16]
        paddsw  xmm2, [ebx+32]
        paddsw  xmm3, [ebx+48]
        paddsw  xmm4, [ebx+64]
        paddsw  xmm5, [ebx+80]
        paddsw  xmm6, [ebx+96]
        paddsw  xmm7, [ebx+112]

        ; add stream 3 and write the output
        m128_mix3_step xmm0, 0
        m128_mix3_step xmm1, 16
        m128_mix3_step xmm2, 32
        m128_mix3_step xmm3, 48
        m128_mix3_step xmm4, 64
        m128_mix3_step xmm5, 80
        m128_mix3_step xmm6, 96
        m128_mix3_step xmm7, 112
        ret
 
;-----------------------------------------------------------------------
; mmx128_mix_4 - mix four 128-byte blocks of packed 16-bit words with
;                signed saturation (paddsw) using 128-bit XMM registers.
; In:    edi = output
;        eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edx = input stream 4
; Out:   [edi .. edi+127] = (stream 1 + stream 3) + (stream 2 + stream 4)
; Clobb: xmm0-xmm7
; Note:  movaps and the paddsw memory operands require 16-byte aligned
;        addresses; the callers are assumed to guarantee that for all
;        five pointers - TODO confirm against the buffer allocation.
;-----------------------------------------------------------------------

; One 64-byte half of the mix. Even registers collect stream 1 + 3,
; odd registers collect stream 2 + 4; each even/odd pair is then summed
; and stored. offs = byte offset of the half inside the block.
macro m128_mix4_half offs
{
        movaps  xmm0, [eax+offs]
        movaps  xmm2, [eax+offs+16]
        movaps  xmm4, [eax+offs+32]
        movaps  xmm6, [eax+offs+48]
        movaps  xmm1, [ebx+offs]
        movaps  xmm3, [ebx+offs+16]
        movaps  xmm5, [ebx+offs+32]
        movaps  xmm7, [ebx+offs+48]

        paddsw  xmm0, [ecx+offs]
        paddsw  xmm2, [ecx+offs+16]
        paddsw  xmm4, [ecx+offs+32]
        paddsw  xmm6, [ecx+offs+48]
        paddsw  xmm1, [edx+offs]
        paddsw  xmm3, [edx+offs+16]
        paddsw  xmm5, [edx+offs+32]
        paddsw  xmm7, [edx+offs+48]

        paddsw  xmm0, xmm1
        movaps  [edi+offs], xmm0
        paddsw  xmm2, xmm3
        movaps  [edi+offs+16], xmm2
        paddsw  xmm4, xmm5
        movaps  [edi+offs+32], xmm4
        paddsw  xmm6, xmm7
        movaps  [edi+offs+48], xmm6
}

align 4
mmx128_mix_4:
        ; prefetch hints for the data 128 bytes ahead of every input
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]
        prefetcht1 [ecx+128]
        prefetcht1 [edx+128]

        m128_mix4_half 0                ; bytes   0..63
        m128_mix4_half 64               ; bytes  64..127
        ret
end if
/kernel/trunk/drivers/mixer.asm
90,7 → 90,7
.m3:
add [output],512
 
sub [main_count], 1
dec [main_count]
jnz .l00
 
call update_stream
622,6 → 622,7
ret
endp
 
align 4
proc m16_s_mmx
 
movq mm0, [esi]
777,56 → 778,59
ret
endp
 
 
;-----------------------------------------------------------------------
; mix_2_1 - mix two input streams into the output buffer.
;
; stdcall arguments:
;   output = destination buffer
;   str0   = first input stream
;   str1   = second input stream
;
; Processes 4 x 128 = 512 bytes by calling the routine whose address is
; stored in [mix_2_core] four times. The core routine takes its
; operands in registers (edi = output, eax = stream 1, ebx = stream 2)
; and is relied upon to leave eax, ebx, edi and esi unchanged, because
; the pointers are simply advanced between the calls.
;
; This body is the single register-based sequence. The text it replaces
; also contained the superseded "stdcall mix_2_1_mmx" sequence, which
; mixed the data a second time and destroyed eax between core calls.
;
; NOTE(review): ebx, esi and edi are modified without being saved -
; confirm that the callers do not expect them to be preserved.
;-----------------------------------------------------------------------
align 4
proc mix_2_1 stdcall, output:dword, str0:dword, str1:dword

        mov     edi, [output]
        mov     eax, [str0]
        mov     ebx, [str1]
        mov     esi, 128                ; bytes handled by one core call
        call    [mix_2_core]            ; edi, eax, ebx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        call    [mix_2_core]            ; edi, eax, ebx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        call    [mix_2_core]            ; edi, eax, ebx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        call    [mix_2_core]            ; edi, eax, ebx
        ret
endp
 
 
;-----------------------------------------------------------------------
; mix_3_1 - mix three input streams into the output buffer.
;
; stdcall arguments:
;   output = destination buffer
;   str0   = first input stream
;   str1   = second input stream
;   str2   = third input stream
;
; Processes 4 x 128 = 512 bytes by calling the routine whose address is
; stored in [mix_3_core] four times. The core routine takes its
; operands in registers (edi = output, eax/ebx/ecx = streams 1..3) and
; is relied upon to leave eax, ebx, ecx, edi and esi unchanged, because
; the pointers are simply advanced between the calls.
;
; This body is the single register-based sequence. The text it replaces
; also contained the superseded "stdcall mix_3_1_mmx" sequence, which
; mixed the data a second time and destroyed eax/ebx/ecx between core
; calls.
;
; NOTE(review): ebx, esi and edi are modified without being saved -
; confirm that the callers do not expect them to be preserved.
;-----------------------------------------------------------------------
align 4
proc mix_3_1 stdcall, output:dword, str0:dword, str1:dword, str2:dword

        mov     edi, [output]
        mov     eax, [str0]
        mov     ebx, [str1]
        mov     ecx, [str2]
        mov     esi, 128                ; bytes handled by one core call
        call    [mix_3_core]            ; edi, eax, ebx, ecx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        add     ecx, esi
        call    [mix_3_core]            ; edi, eax, ebx, ecx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        add     ecx, esi
        call    [mix_3_core]            ; edi, eax, ebx, ecx

        add     edi, esi
        add     eax, esi
        add     ebx, esi
        add     ecx, esi
        call    [mix_3_core]            ; edi, eax, ebx, ecx
        ret
endp
 
839,29 → 843,35
call alloc_mix_buff
and eax, eax
jz .err
mov [output], eax
 
mov edi, eax
mov eax, [str0]
mov ebx, [str1]
mov ecx, [str2]
mov edx, [str3]
mov esi, 128
call [mix_4_core] ;edi, eax, ebx, ecx, edx
 
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
 
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
 
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
mov eax, [output]
ret
.err:
876,322 → 886,37
 
mov edi, [output]
 
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
 
ret
endp
 
;-----------------------------------------------------------------------
; mix_2_1_mmx - add two 128-byte blocks of packed 16-bit words with
;               signed saturation (paddsw) and store the sum.
;
; stdcall arguments:
;   output = destination block
;   str0   = first source block
;   str1   = second source block
;
; Uses eax, ecx, edx as pointers and mm0-mm3 as data registers; no emms
; is executed here.
;-----------------------------------------------------------------------

; Mix one quadword: reg = [str0+offs] + [str1+offs], saturated,
; then store it at [output+offs].
macro mix_2_1_mmx_qword reg, offs
{
        movq    reg, [eax+offs]
        paddsw  reg, [ecx+offs]
        movq    [edx+offs], reg
}

align 4
proc mix_2_1_mmx stdcall, output:dword, str0:dword, str1:dword

        mov     edx, [output]
        mov     eax, [str0]
        mov     ecx, [str1]

        ; the data registers are used round-robin, one per quadword
        mix_2_1_mmx_qword mm0, 0
        mix_2_1_mmx_qword mm1, 8
        mix_2_1_mmx_qword mm2, 16
        mix_2_1_mmx_qword mm3, 24

        mix_2_1_mmx_qword mm0, 32
        mix_2_1_mmx_qword mm1, 40
        mix_2_1_mmx_qword mm2, 48
        mix_2_1_mmx_qword mm3, 56

        mix_2_1_mmx_qword mm0, 64
        mix_2_1_mmx_qword mm1, 72
        mix_2_1_mmx_qword mm2, 80
        mix_2_1_mmx_qword mm3, 88

        mix_2_1_mmx_qword mm0, 96
        mix_2_1_mmx_qword mm1, 104
        mix_2_1_mmx_qword mm2, 112
        mix_2_1_mmx_qword mm3, 120

        ret
endp
 
 
 
;-----------------------------------------------------------------------
; mix_3_1_mmx - add three 128-byte blocks of packed 16-bit words with
;               signed saturation (paddsw) and store the sum.
;
; stdcall arguments:
;   output = destination block
;   str0   = first source block
;   str1   = second source block
;   str2   = third source block
;
; Uses eax, ebx, ecx as source pointers, edx as the destination pointer
; and mm0-mm3 as data registers; no emms is executed here.
;
; Lines that belong to a different procedure have been removed from the
; body: "mov edx, [str3]" (str3 is not an argument here and the load
; overwrote the output pointer), "mov esi, 128" and four
; "add ... / call [mix_4_core]" groups.
;
; NOTE(review): ebx is modified without being saved - confirm that the
; callers do not expect it to be preserved.
;-----------------------------------------------------------------------

; Mix one quadword: reg = [str0+offs] + [str1+offs] + [str2+offs],
; saturated after each add, then store it at [output+offs].
macro mix_3_1_mmx_qword reg, offs
{
        movq    reg, [eax+offs]
        paddsw  reg, [ebx+offs]
        paddsw  reg, [ecx+offs]
        movq    [edx+offs], reg
}

align 4
proc mix_3_1_mmx stdcall, output:dword, str0:dword, str1:dword, str2:dword

        mov     edx, [output]
        mov     eax, [str0]
        mov     ebx, [str1]
        mov     ecx, [str2]

        ; the data registers are used round-robin, one per quadword
        mix_3_1_mmx_qword mm0, 0
        mix_3_1_mmx_qword mm1, 8
        mix_3_1_mmx_qword mm2, 16
        mix_3_1_mmx_qword mm3, 24

        mix_3_1_mmx_qword mm0, 32
        mix_3_1_mmx_qword mm1, 40
        mix_3_1_mmx_qword mm2, 48
        mix_3_1_mmx_qword mm3, 56

        mix_3_1_mmx_qword mm0, 64
        mix_3_1_mmx_qword mm1, 72
        mix_3_1_mmx_qword mm2, 80
        mix_3_1_mmx_qword mm3, 88

        mix_3_1_mmx_qword mm0, 96
        mix_3_1_mmx_qword mm1, 104
        mix_3_1_mmx_qword mm2, 112
        mix_3_1_mmx_qword mm3, 120

        ret
endp
 
;-----------------------------------------------------------------------
; mix_4_1_mmx - add four 128-byte blocks of packed 16-bit words with
;               signed saturation (paddsw) and store the sum.
;
; stdcall arguments:
;   output = destination block
;   str0 .. str3 = the four source blocks
;
; Every output quadword is computed as
;   (str0 + str2) + (str1 + str3)
; with saturation applied after each add. Uses esi, eax, ebx, ecx as
; source pointers, edx as the destination pointer and mm0-mm3 as data
; registers; no emms is executed here.
;-----------------------------------------------------------------------

; Mix one quadword at byte offset offs.
;   ra collects str0 + str2, rb collects str1 + str3, ra receives the
;   final sum and is stored to the output.
macro mix_4_1_mmx_qword ra, rb, offs
{
        movq    ra, [esi+offs]
        movq    rb, [eax+offs]
        paddsw  ra, [ebx+offs]
        paddsw  rb, [ecx+offs]
        paddsw  ra, rb
        movq    [edx+offs], ra
}

align 4
proc mix_4_1_mmx stdcall, output:dword, str0:dword, str1:dword,\
str2:dword, str3:dword

        mov     edx, [output]
        mov     esi, [str0]
        mov     eax, [str1]
        mov     ebx, [str2]
        mov     ecx, [str3]

        ; bytes 0..79 alternate between the mm0/mm1 and mm2/mm3 pairs
        mix_4_1_mmx_qword mm0, mm1, 0
        mix_4_1_mmx_qword mm2, mm3, 8
        mix_4_1_mmx_qword mm0, mm1, 16
        mix_4_1_mmx_qword mm2, mm3, 24
        mix_4_1_mmx_qword mm0, mm1, 32
        mix_4_1_mmx_qword mm2, mm3, 40
        mix_4_1_mmx_qword mm0, mm1, 48
        mix_4_1_mmx_qword mm2, mm3, 56
        mix_4_1_mmx_qword mm0, mm1, 64
        mix_4_1_mmx_qword mm2, mm3, 72

        ; bytes 80..127 use the mm2/mm3 pair only
        mix_4_1_mmx_qword mm2, mm3, 80
        mix_4_1_mmx_qword mm2, mm3, 88
        mix_4_1_mmx_qword mm2, mm3, 96
        mix_4_1_mmx_qword mm2, mm3, 104
        mix_4_1_mmx_qword mm2, mm3, 112
        mix_4_1_mmx_qword mm2, mm3, 120

        ret
endp
 
align 4
proc copy_mem stdcall, output:dword, input:dword
 
mov edi, [output]