/kernel/trunk/core/fpu.inc |
---|
13,26 → 13,35 |
fninit |
bt [cpu_caps+(CAPS_XSAVE/32)*4], CAPS_XSAVE mod 32 |
jmp .no_xsave ; not ready to be jnc so far |
jnc .no_xsave |
mov ecx, cr4 |
or ecx, CR4_OSXSAVE |
mov cr4, ecx |
; don't call cpuid again |
bts [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 |
mov eax, 0x0d |
; zero xsave header |
mov ecx, 64/4 |
xor eax, eax |
mov edi, fpu_data + 512 ; skip legacy region |
rep stosd |
mov eax, 0x0d ; extended state enumeration main leaf |
xor ecx, ecx |
cpuid |
mov ebx, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512 |
and ebx, eax |
and eax, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512 |
xor edx, edx |
mov [xsave_eax], eax |
mov [xsave_edx], edx |
xor ecx, ecx |
xgetbv |
or eax, ebx |
xor ecx, ecx |
xsetbv |
mov eax, 0x0d |
xor ecx, ecx |
cpuid |
add ebx, 63 |
and ebx, NOT 63 |
mov [xsave_area_size], ebx |
cmp ebx, fpu_data_size |
ja $ |
40,6 → 49,8 |
test eax, XCR0_AVX512 |
jz @f |
call init_avx512 |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [fpu_data] |
ret |
@@: |
46,12 → 57,18 |
test eax, XCR0_AVX |
jz @f |
call init_avx |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [fpu_data] |
ret |
@@: |
test eax, XCR0_SSE |
jnz .sse |
jmp .fpu_mmx |
jz $ |
call init_sse |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [fpu_data] |
ret |
.no_xsave: |
mov [xsave_area_size], 512 ; enough for FPU/MMX and SSE |
bt [cpu_caps], CAPS_SSE |
186,6 → 203,7 |
; param |
; eax= avx_save_size() bytes memory area aligned on a 64-byte boundary |
align 4 |
avx_save: |
push ecx |
push esi |
230,7 → 248,12 |
; save_context: store the current FPU/SIMD state into the buffer addressed by eax. |
; in:  eax = pointer to the destination save area |
; out: eax and edx are preserved (push/pop below); ecx is overwritten with the |
;      buffer pointer and is not restored. |
; When the CAPS_OSXSAVE bit in cpu_caps is clear, control is transferred to |
; save_fpu_context instead. |
; NOTE(review): this is a diff listing (hunk 230,7 -> 248,12); the bare |
; `xsave [eax]` right after the jnc looks like the pre-change line that the |
; push ... pop sequence replaces - confirm against the actual file. |
save_context: |
; CF = CAPS_OSXSAVE capability bit |
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 |
; CF clear -> take the save_fpu_context path |
jnc save_fpu_context |
xsave [eax] |
; keep the caller's eax/edx: xsave takes its component mask in edx:eax |
push eax edx |
; ecx = destination buffer, freeing eax for the mask |
mov ecx, eax |
; edx:eax = mask stored in xsave_eax / xsave_edx |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [ecx] |
pop edx eax |
ret |
save_fpu_context: |
bt [cpu_caps], CAPS_SSE |
284,6 → 307,7 |
pop ecx |
ret |
align 4 |
avx_restore: |
push ecx |
push esi |
301,7 → 325,11 |
clts |
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 |
jnc .no_xsave |
push edx |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xrstor [esi] |
pop edx |
popfd |
pop esi |
pop ecx |
351,12 → 379,15 |
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] |
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 |
jnc .no_xsave |
xsave [eax] |
mov ecx, eax |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [ecx] |
mov ebx, [CURRENT_TASK] |
mov [fpu_owner], ebx |
shl ebx, 8 |
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] |
xrstor [eax] |
mov ecx, [ebx+SLOT_BASE+APPDATA.fpu_state] |
xrstor [ecx] |
.exit: |
restore_ring3_context |
iret |
/kernel/trunk/core/sched.inc |
---|
132,6 → 132,20 |
; set gs selector unconditionally |
Mov ax, graph_data |
Mov gs, ax |
; TS flag is not triggered by AVX* instructions, therefore |
; we have to xsave/xrstor SIMD registers on every task switch |
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 |
jnc .no_xsave |
mov ecx, [esi+APPDATA.fpu_state] |
mov eax, [xsave_eax] |
mov edx, [xsave_edx] |
xsave [ecx] |
mov ecx, [CURRENT_TASK] |
mov [fpu_owner], ecx |
mov ecx, [current_slot] |
mov ecx, [ecx+APPDATA.fpu_state] |
xrstor [ecx] |
.no_xsave: |
; set CR0.TS |
cmp bh, byte[fpu_owner] ;bh == incoming task (new) |
clts ;clear a task switch flag |
/kernel/trunk/core/taskman.inc |
---|
929,10 → 929,8 |
shr ecx, 2 |
rep movsd |
cmp ebx, [TASK_COUNT] |
jle .noinc |
inc dword [TASK_COUNT] ;update number of processes |
.noinc: |
cmp [TASK_COUNT], ebx |
adc dword [TASK_COUNT], 0 ; update number of processes |
shl ebx, 8 |
lea edx, [ebx+SLOT_BASE+APP_EV_OFFSET] |
mov [SLOT_BASE+APPDATA.fd_ev+ebx], edx |
/kernel/trunk/data32.inc |
---|
335,8 → 335,9 |
align 16 |
cur_saved_data: |
rb 4096 |
; fpu_data: statically reserved FPU/SIMD save area; the init code in fpu.inc |
; executes `xsave [fpu_data]` on it. |
; xsave requires its memory operand to be 64-byte aligned, hence the align. |
; NOTE(review): this is a diff listing; `rb 1024` is presumably the pre-change |
; reservation and `rb 0xa80` its replacement - confirm against the actual file. |
align 64 |
fpu_data: |
rb 1024 |
rb 0xa80 ; bochs avx512 |
; number of bytes reserved since the fpu_data label; fpu.inc compares |
; xsave_area_size against this value and spins (`ja $`) if it is larger |
fpu_data_size = $ - fpu_data |
draw_data: |
rb 32*256 |
434,6 → 435,8 |
cpu_caps rd 4 |
xsave_area_size dd ? |
xsave_eax dd ? |
xsave_edx dd ? |
pg_data PG_DATA |
heap_test dd ? |
/kernel/trunk/kernel.asm |
---|
434,7 → 434,10 |
;lidt [idtreg] |
call init_kernel_heap |
stdcall kernel_alloc, (RING0_STACK_SIZE+512) * 2 |
call init_fpu |
mov eax, [xsave_area_size] |
lea eax, [eax*2 + RING0_STACK_SIZE*2] |
stdcall kernel_alloc, eax |
mov [os_stack_seg], eax |
lea esp, [eax+RING0_STACK_SIZE] |
469,7 → 472,6 |
mov [LFBAddress], LFB_BASE |
mov ecx, bios_fb |
call set_framebuffer |
call init_fpu |
call init_malloc |
stdcall alloc_kernel_space, 0x50000 ; FIXME check size |
590,7 → 592,8 |
mov edx, SLOT_BASE+256*1 |
mov ebx, [os_stack_seg] |
add ebx, 0x2000 |
add ebx, RING0_STACK_SIZE |
add ebx, [xsave_area_size] |
call setup_os_slot |
mov dword [edx], 'IDLE' |
sub [edx+APPDATA.saved_esp], 4 |
/programs/demos/firework/trunk/firework.asm |
---|
7,7 → 7,8 |
; Optimized for KolibriOS, By Diamond |
; Assemble with |
; c:fasm firework.asm firework.kex |
; NOTE: Needs MMX & SSE, optionally AVX |
; NOTE: Needs MMX & SSE, |
; optionally AVX, AVX2, AVX512 |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
use32 |
org 0x0 |
23,8 → 24,12 |
include '../../../macros.inc' |
SCREEN_WIDTH = 320 |
SCREEN_HEIGHT = 200 |
SIMD equ SSE |
SIMD_BYTES = 8 |
SIMD equ AVX |
SIMD_BYTES = 16 |
; SSE 8 |
; AVX 16 |
; AVX2 32 |
; AVX512 64 |
assert SCREEN_WIDTH mod SIMD_BYTES = 0 |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
; Global defines |