0,0 → 1,115 |
.section #gm107_builtin_code |
// DIV U32 |
// |
// UNR recurrence (q = a / b): |
// look for z such that 2^32 - b <= b * z < 2^32 |
// then q - 1 <= (a * z) / 2^32 <= q |
// |
// INPUT: $r0: dividend, $r1: divisor |
// OUTPUT: $r0: result, $r1: modulus |
// CLOBBER: $r2 - $r3, $p0 - $p1 |
// SIZE: 22 / 14 * 8 bytes |
// |
gm107_div_u32: |
sched 0x7e0 0x7e0 0x7e0 |
flo u32 $r2 $r1 |
lop xor 1 $r2 $r2 0x1f |
mov $r3 0x1 0xf |
sched 0x7e0 0x7e0 0x7e0 |
shl $r2 $r3 $r2 |
i2i u32 u32 $r1 neg $r1 |
imul u32 u32 $r3 $r1 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
mov $r3 $r0 0xf |
imul u32 u32 hi $r0 $r0 $r2 |
i2i u32 u32 $r2 neg $r1 |
sched 0x7e0 0x7e0 0x7e0 |
imad u32 u32 $r1 $r1 $r0 $r3 |
isetp ge u32 and $p0 1 $r1 $r2 1 |
$p0 iadd $r1 $r1 neg $r2 |
sched 0x7e0 0x7e0 0x7e0 |
$p0 iadd $r0 $r0 0x1 |
$p0 isetp ge u32 and $p0 1 $r1 $r2 1 |
$p0 iadd $r1 $r1 neg $r2 |
sched 0x7e0 0x7e0 0x7e0 |
$p0 iadd $r0 $r0 0x1 |
ret |
nop 0 |
|
// DIV S32, like DIV U32 after taking ABS(inputs) |
// |
// INPUT: $r0: dividend, $r1: divisor |
// OUTPUT: $r0: result, $r1: modulus |
// CLOBBER: $r2 - $r3, $p0 - $p3 |
// |
gm107_div_s32: |
sched 0x7e0 0x7e0 0x7e0 |
isetp lt and $p2 0x1 $r0 0 1 |
isetp lt xor $p3 1 $r1 0 $p2 |
i2i s32 s32 $r0 abs $r0 |
sched 0x7e0 0x7e0 0x7e0 |
i2i s32 s32 $r1 abs $r1 |
flo u32 $r2 $r1 |
lop xor 1 $r2 $r2 0x1f |
sched 0x7e0 0x7e0 0x7e0 |
mov $r3 0x1 0xf |
shl $r2 $r3 $r2 |
i2i u32 u32 $r1 neg $r1 |
sched 0x7e0 0x7e0 0x7e0 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imul u32 u32 $r3 $r1 $r2 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
imul u32 u32 $r3 $r1 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
imad u32 u32 hi $r2 $r2 $r3 $r2 |
mov $r3 $r0 0xf |
imul u32 u32 hi $r0 $r0 $r2 |
sched 0x7e0 0x7e0 0x7e0 |
i2i u32 u32 $r2 neg $r1 |
imad u32 u32 $r1 $r1 $r0 $r3 |
isetp ge u32 and $p0 1 $r1 $r2 1 |
sched 0x7e0 0x7e0 0x7e0 |
$p0 iadd $r1 $r1 neg $r2 |
$p0 iadd $r0 $r0 0x1 |
$p0 isetp ge u32 and $p0 1 $r1 $r2 1 |
sched 0x7e0 0x7e0 0x7e0 |
$p0 iadd $r1 $r1 neg $r2 |
$p0 iadd $r0 $r0 0x1 |
$p3 i2i s32 s32 $r0 neg $r0 |
sched 0x7e0 0x7e0 0x7e0 |
$p2 i2i s32 s32 $r1 neg $r1 |
ret |
nop 0 |
|
// STUB |
gm107_rcp_f64: |
gm107_rsq_f64: |
sched 0x7e0 0x7e0 0x7e0 |
ret |
nop 0 |
nop 0 |
|
.section #gm107_builtin_offsets |
.b64 #gm107_div_u32 |
.b64 #gm107_div_s32 |
.b64 #gm107_rcp_f64 |
.b64 #gm107_rsq_f64 |