Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. //
  2. // DIV U32
  3. //
  4. // UNR recurrence (q = a / b):
  5. // look for z such that 2^32 - b <= b * z < 2^32
  6. // then q - 1 <= (a * z) / 2^32 <= q
  7. //
  8. // INPUT:   $r0: dividend, $r1: divisor
  9. // OUTPUT:  $r0: result, $r1: modulus
  10. // CLOBBER: $r2 - $r3, $p0 - $p1
  11. // SIZE:    22 / 14 * 8 bytes
  12. //
  // Unsigned 32-bit divide: on return $r0 = a / b and $r1 = a % b, where
  // a = $r0 and b = $r1 on entry. $r2 holds the reciprocal estimate z and
  // $r3 is scratch. Of the predicates, only $p0 is written below.
  //
  // Seed z = 1 << (31 - bfind(b)); bfind is presumed to return the index of
  // the highest set bit of b (TODO confirm against the ISA reference).
  // For values 0..31, xor with 0x1f is the same as subtracting from 31.
  13. bfind u32 $r2 $r1
  14. xor b32 $r2 $r2 0x1f
  15. mov b32 $r3 0x1
  16. shl b32 $r2 $r3 clamp $r2
  // $r1 = -b, so the low 32 bits of $r1 * z are (2^32 - b * z) mod 2^32.
  17. cvt u32 $r1 neg u32 $r1
  // Five identical refinement steps: $r3 = low32(-b * z), then
  // z += high32(z * $r3).
  18. mul $r3 u32 $r1 u32 $r2
  19. add $r2 (mul high u32 $r2 u32 $r3) $r2
  20. mul $r3 u32 $r1 u32 $r2
  21. add $r2 (mul high u32 $r2 u32 $r3) $r2
  22. mul $r3 u32 $r1 u32 $r2
  23. add $r2 (mul high u32 $r2 u32 $r3) $r2
  24. mul $r3 u32 $r1 u32 $r2
  25. add $r2 (mul high u32 $r2 u32 $r3) $r2
  26. mul $r3 u32 $r1 u32 $r2
  27. add $r2 (mul high u32 $r2 u32 $r3) $r2
  // Keep the dividend in $r3, then form the quotient estimate
  // q = high32(a * z) in $r0.
  28. mov b32 $r3 $r0
  29. mul high $r0 u32 $r0 u32 $r2
  // $r2 = -(-b) = b again; $r1 = (-b) * q + a, i.e. the remainder a - q * b.
  30. cvt u32 $r2 neg u32 $r1
  31. add $r1 (mul u32 $r1 u32 $r0) $r3
  // First correction: if remainder >= b, subtract b and bump the quotient.
  32. set $p0 0x1 ge u32 $r1 $r2
  33. $p0 sub b32 $r1 $r1 $r2
  34. $p0 add b32 $r0 $r0 0x1
  // Second correction. The compare itself is predicated on $p0, so it is
  // only re-evaluated when the first correction fired; otherwise $p0 stays
  // false and the two instructions after it are skipped.
  35. $p0 set $p0 0x1 ge u32 $r1 $r2
  36. $p0 sub b32 $r1 $r1 $r2
  37. $p0 add b32 $r0 $r0 0x1
  38. ret
  39. //
  40. // DIV S32, like DIV U32 after taking ABS(inputs)
  41. //
  42. // INPUT:   $r0: dividend, $r1: divisor
  43. // OUTPUT:  $r0: result, $r1: modulus
  44. // CLOBBER: $r2 - $r3, $p0 - $p3
  45. //
  // Signed 32-bit divide: runs the unsigned sequence on |a| and |b|, then
  // restores signs. $r0 = a (dividend) and $r1 = b (divisor) on entry;
  // $r0 = quotient and $r1 = remainder on return. Predicates written below
  // are $p0, $p2 and $p3.
  //
  // $p2 = (a < 0); $p3 = (b < 0) xor $p2, i.e. true when the operand signs
  // differ and the quotient must be negated.
  46. set $p2 0x1 lt s32 $r0 0x0
  47. set $p3 0x1 lt s32 $r1 0x0 xor $p2
  // Replace both operands by their absolute values.
  48. cvt s32 $r0 abs s32 $r0
  49. cvt s32 $r1 abs s32 $r1
  // Seed z = 1 << (31 - bfind(|b|)); bfind is presumed to return the index
  // of the highest set bit (TODO confirm against the ISA reference).
  50. bfind u32 $r2 $r1
  51. xor b32 $r2 $r2 0x1f
  52. mov b32 $r3 0x1
  53. shl b32 $r2 $r3 clamp $r2
  // $r1 = -|b|, so the low 32 bits of $r1 * z are (2^32 - |b| * z) mod 2^32.
  54. cvt u32 $r1 neg u32 $r1
  // Five identical refinement steps: $r3 = low32(-|b| * z), then
  // z += high32(z * $r3).
  55. mul $r3 u32 $r1 u32 $r2
  56. add $r2 (mul high u32 $r2 u32 $r3) $r2
  57. mul $r3 u32 $r1 u32 $r2
  58. add $r2 (mul high u32 $r2 u32 $r3) $r2
  59. mul $r3 u32 $r1 u32 $r2
  60. add $r2 (mul high u32 $r2 u32 $r3) $r2
  61. mul $r3 u32 $r1 u32 $r2
  62. add $r2 (mul high u32 $r2 u32 $r3) $r2
  63. mul $r3 u32 $r1 u32 $r2
  64. add $r2 (mul high u32 $r2 u32 $r3) $r2
  // Keep |a| in $r3, then form the quotient estimate q = high32(|a| * z).
  65. mov b32 $r3 $r0
  66. mul high $r0 u32 $r0 u32 $r2
  // $r2 = |b| again; $r1 = (-|b|) * q + |a|, i.e. the remainder |a| - q * |b|.
  67. cvt u32 $r2 neg u32 $r1
  68. add $r1 (mul u32 $r1 u32 $r0) $r3
  // First correction: if remainder >= |b|, subtract |b| and bump the quotient.
  69. set $p0 0x1 ge u32 $r1 $r2
  70. $p0 sub b32 $r1 $r1 $r2
  71. $p0 add b32 $r0 $r0 0x1
  // Second correction, re-evaluated only when the first one fired because
  // the compare is itself predicated on $p0.
  72. $p0 set $p0 0x1 ge u32 $r1 $r2
  73. $p0 sub b32 $r1 $r1 $r2
  74. $p0 add b32 $r0 $r0 0x1
  // Restore signs: negate the quotient when the operand signs differed
  // ($p3), and negate the remainder when the dividend was negative ($p2).
  75. $p3 cvt s32 $r0 neg s32 $r0
  76. $p2 cvt s32 $r1 neg s32 $r1
  77. ret
  78. //
  79. // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
  80. //
  81. // INPUT:   $r0d (x)
  82. // OUTPUT:  $r0d (rcp(x))
  83. // CLOBBER: $r2 - $r7
  84. // SIZE:    9 * 8 bytes
  85. //
  // NOTE(review): the body is only nop + ret, so as written this entry
  // returns without modifying $r0d or touching $r2 - $r7. It appears to be
  // a placeholder for the Newton-Raphson sequence described in the header;
  // confirm before relying on it for rcp(x).
  86. nop
  87. ret
  88. // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
  89. //
  90. // INPUT:   $r0d (x)
  91. // OUTPUT:  $r0d (rsqrt(x))
  92. // CLOBBER: $r2 - $r7
  93. // SIZE:    14 * 8 bytes
  94. //
  // NOTE(review): the body is only nop + ret, so as written this entry
  // returns without modifying $r0d or touching $r2 - $r7. It appears to be
  // a placeholder for the Newton-Raphson sequence described in the header;
  // confirm before relying on it for rsqrt(x).
  95. nop
  96. ret
  97.