Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4358 | Serge | 1 | // |
2 | // DIV U32 |
||
3 | // |
||
4 | // UNR recurrence (q = a / b): |
||
5 | // look for z such that 2^32 - b <= b * z < 2^32 |
||
6 | // then q - 1 <= (a * z) / 2^32 <= q |
||
7 | // |
||
8 | // INPUT: $r0: dividend, $r1: divisor |
||
9 | // OUTPUT: $r0: result, $r1: modulus |
||
10 | // CLOBBER: $r2 - $r3, $p0 - $p1 |
||
11 | // SIZE: 22 / 14 * 8 bytes |
||
12 | // |
||
// Unsigned 32-bit divide: a = $r0, b = $r1 on entry (see INPUT/OUTPUT above).
// NOTE(review): no explicit test for b == 0 is present in this routine.
// NOTE(review): the header lists $p1 as clobbered, but only $p0 is referenced below.
//
// Step 1: seed the reciprocal estimate z in $r2 from the position of the
// divisor's highest set bit: z = 1 << (bfind(b) ^ 31), shift amount clamped.
13 | bfind u32 $r2 $r1 |
||
14 | xor b32 $r2 $r2 0x1f |
||
15 | mov b32 $r3 0x1 |
||
16 | shl b32 $r2 $r3 clamp $r2 |
||
// From here on $r1 holds -b, so products with $r1 are already negated.
17 | cvt u32 $r1 neg u32 $r1 |
||
// Step 2: five identical refinement rounds of the recurrence named in the
// header: $r3 = (-b) * z, then z += mulhi(z, $r3).
18 | mul $r3 u32 $r1 u32 $r2 |
||
19 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
20 | mul $r3 u32 $r1 u32 $r2 |
||
21 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
22 | mul $r3 u32 $r1 u32 $r2 |
||
23 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
24 | mul $r3 u32 $r1 u32 $r2 |
||
25 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
26 | mul $r3 u32 $r1 u32 $r2 |
||
27 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
// Step 3: keep a copy of the dividend in $r3, then take the quotient
// estimate q = mulhi(a, z) into $r0.
28 | mov b32 $r3 $r0 |
||
29 | mul high $r0 u32 $r0 u32 $r2 |
||
// $r2 = -(-b) = b again; z is no longer needed.
30 | cvt u32 $r2 neg u32 $r1 |
||
// Remainder estimate: $r1 = (-b) * q + a.
31 | add $r1 (mul u32 $r1 u32 $r0) $r3 |
||
// Step 4: correction. While the remainder is still >= b, subtract b and
// bump the quotient. Two rounds are unrolled; the second compare is itself
// guarded by $p0, so it only runs if the first correction was applied.
// NOTE(review): the header bound (q - 1 <= estimate <= q) suggests a single
// round would suffice - confirm why the second one is kept.
32 | set $p0 0x1 ge u32 $r1 $r2 |
||
33 | $p0 sub b32 $r1 $r1 $r2 |
||
34 | $p0 add b32 $r0 $r0 0x1 |
||
35 | $p0 set $p0 0x1 ge u32 $r1 $r2 |
||
36 | $p0 sub b32 $r1 $r1 $r2 |
||
37 | $p0 add b32 $r0 $r0 0x1 |
||
// Result: $r0 = quotient, $r1 = modulus.
38 | ret |
||
39 | // |
||
40 | // DIV S32, like DIV U32 after taking ABS(inputs) |
||
41 | // |
||
42 | // INPUT: $r0: dividend, $r1: divisor |
||
43 | // OUTPUT: $r0: result, $r1: modulus |
||
44 | // CLOBBER: $r2 - $r3, $p0 - $p3 |
||
45 | // |
||
// Signed 32-bit divide: record the signs, divide the absolute values with
// the same sequence as DIV U32, then re-apply the signs.
// NOTE(review): no explicit test for a zero divisor is present in this routine.
// NOTE(review): the header lists $p0 - $p3 as clobbered, but $p1 is never
// referenced below.
//
// $p2 = dividend is negative (later used for the sign of the modulus).
46 | set $p2 0x1 lt s32 $r0 0x0 |
||
// $p3 = (divisor is negative) xor $p2, i.e. the operand signs differ
// (later used for the sign of the quotient).
47 | set $p3 0x1 lt s32 $r1 0x0 xor $p2 |
||
48 | cvt s32 $r0 abs s32 $r0 |
||
49 | cvt s32 $r1 abs s32 $r1 |
||
// Seed the reciprocal estimate z in $r2 from the highest set bit of |b|:
// z = 1 << (bfind(|b|) ^ 31), shift amount clamped.
50 | bfind u32 $r2 $r1 |
||
51 | xor b32 $r2 $r2 0x1f |
||
52 | mov b32 $r3 0x1 |
||
53 | shl b32 $r2 $r3 clamp $r2 |
||
// From here on $r1 holds -|b|.
54 | cvt u32 $r1 neg u32 $r1 |
||
// Five identical refinement rounds: $r3 = (-|b|) * z, then z += mulhi(z, $r3).
55 | mul $r3 u32 $r1 u32 $r2 |
||
56 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
57 | mul $r3 u32 $r1 u32 $r2 |
||
58 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
59 | mul $r3 u32 $r1 u32 $r2 |
||
60 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
61 | mul $r3 u32 $r1 u32 $r2 |
||
62 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
63 | mul $r3 u32 $r1 u32 $r2 |
||
64 | add $r2 (mul high u32 $r2 u32 $r3) $r2 |
||
// Keep |a| in $r3, then quotient estimate q = mulhi(|a|, z) into $r0.
65 | mov b32 $r3 $r0 |
||
66 | mul high $r0 u32 $r0 u32 $r2 |
||
// $r2 = |b| again; z is no longer needed.
67 | cvt u32 $r2 neg u32 $r1 |
||
// Remainder estimate: $r1 = (-|b|) * q + |a|.
68 | add $r1 (mul u32 $r1 u32 $r0) $r3 |
||
// Correction: while the remainder is still >= |b|, subtract |b| and bump
// the quotient. Two rounds are unrolled; the second compare is guarded by
// $p0, so it only runs if the first correction was applied.
69 | set $p0 0x1 ge u32 $r1 $r2 |
||
70 | $p0 sub b32 $r1 $r1 $r2 |
||
71 | $p0 add b32 $r0 $r0 0x1 |
||
72 | $p0 set $p0 0x1 ge u32 $r1 $r2 |
||
73 | $p0 sub b32 $r1 $r1 $r2 |
||
74 | $p0 add b32 $r0 $r0 0x1 |
||
// Re-apply signs: the quotient is negated when the operand signs differed,
// the modulus is negated when the dividend was negative.
75 | $p3 cvt s32 $r0 neg s32 $r0 |
||
76 | $p2 cvt s32 $r1 neg s32 $r1 |
||
77 | ret |
||
78 | // |
||
79 | // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) |
||
80 | // |
||
81 | // INPUT: $r0d (x) |
||
82 | // OUTPUT: $r0d (rcp(x)) |
||
83 | // CLOBBER: $r2 - $r7 |
||
84 | // SIZE: 9 * 8 bytes |
||
85 | // |
||
// NOTE(review): the visible body is only nop + ret. None of the
// Newton-Raphson steps described in the header are present, and the stated
// SIZE (9 * 8 bytes) and CLOBBER list ($r2 - $r7) do not match these two
// instructions. This looks like a placeholder - confirm whether callers
// expect $r0d to come back unchanged.
86 | nop |
||
87 | ret |
||
88 | // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) |
||
89 | // |
||
90 | // INPUT: $r0d (x) |
||
91 | // OUTPUT: $r0d (rsqrt(x)) |
||
92 | // CLOBBER: $r2 - $r7 |
||
93 | // SIZE: 14 * 8 bytes |
||
94 | // |
||
// NOTE(review): the visible body is only nop + ret. None of the
// Newton-Raphson steps described in the header are present, and the stated
// SIZE (14 * 8 bytes) and CLOBBER list ($r2 - $r7) do not match these two
// instructions. This looks like a placeholder - confirm whether callers
// expect $r0d to come back unchanged.
95 | nop |
||
96 | ret |