Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. //
  2. // DIV U32
  3. //
  4. // UNR recurrence (q = a / b):
  5. // look for z such that 2^32 - b <= b * z < 2^32
  6. // then q - 1 <= (a * z) / 2^32 <= q
  7. //
  8. // INPUT:   $r0: dividend, $r1: divisor
  9. // OUTPUT:  $r0: result, $r1: modulus
  10. // CLOBBER: $r2 - $r3, $p0 - $p1
  11. // SIZE:    22 / 14 * 8 bytes
  12. //
  13. sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 // 7 scheduling control values; presumably one per following instruction (confirm against envytools docs)
  14. bfind u32 $r2 $r1 // $r2 = bit position found in divisor b (presumably index of the highest set bit)
  15. long xor b32 $r2 $r2 0x1f // $r2 ^= 31, i.e. 31 - position for positions 0..31
  16. long mov b32 $r3 0x1
  17. shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: starting estimate for the recurrence described in the header
  18. long cvt u32 $r1 neg u32 $r1 // $r1 = -b
  19. long mul $r3 u32 $r1 u32 $r2 // $r3 = (-b) * z
  20. add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high32(z * $r3): refinement 1 of 5
  21. sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
  22. mul $r3 u32 $r1 u32 $r2
  23. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 2
  24. mul $r3 u32 $r1 u32 $r2
  25. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 3
  26. mul $r3 u32 $r1 u32 $r2
  27. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 4
  28. mul $r3 u32 $r1 u32 $r2
  29. sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
  30. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 5: $r2 now holds the final z
  31. mov b32 $r3 $r0 // keep dividend a in $r3
  32. mul high $r0 u32 $r0 u32 $r2 // q estimate = high32(a * z); per the header this is q or q - 1
  33. long cvt u32 $r2 neg u32 $r1 // $r2 = b again (negation of -b)
  34. long add $r1 (mul u32 $r1 u32 $r0) $r3 // remainder estimate = a + (-b) * q
  35. set $p0 0x1 ge u32 $r1 $r2 // $p0 = remainder >= b (unsigned)
  36. $p0 sub b32 $r1 $r1 $r2 // first correction: remainder -= b
  37. sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
  38. $p0 add b32 $r0 $r0 0x1 // first correction: q += 1
  39. $p0 set $p0 0x1 ge u32 $r1 $r2 // re-test only if the first correction was applied
  40. $p0 sub b32 $r1 $r1 $r2 // second correction: remainder -= b
  41. $p0 add b32 $r0 $r0 0x1 // second correction: q += 1
  42. long ret // $r0 = quotient, $r1 = remainder
  43. //
  44. // DIV S32, like DIV U32 after taking ABS(inputs)
  45. //
  46. // INPUT:   $r0: dividend, $r1: divisor
  47. // OUTPUT:  $r0: result, $r1: modulus
  48. // CLOBBER: $r2 - $r3, $p0 - $p3
  49. //
  50. set $p2 0x1 lt s32 $r0 0x0 // $p2 = dividend is negative
  51. set $p3 0x1 lt s32 $r1 0x0 xor $p2 // $p3 = (divisor is negative) xor $p2: operand signs differ
  52. sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28 // 7 scheduling control values; presumably one per following instruction (confirm against envytools docs)
  53. long cvt s32 $r0 abs s32 $r0 // a = |dividend|
  54. long cvt s32 $r1 abs s32 $r1 // b = |divisor|
  55. bfind u32 $r2 $r1 // from here to the second correction: same sequence as the unsigned divide, on a and b
  56. long xor b32 $r2 $r2 0x1f
  57. long mov b32 $r3 0x1
  58. shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: starting estimate
  59. cvt u32 $r1 neg u32 $r1 // $r1 = -b
  60. sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
  61. mul $r3 u32 $r1 u32 $r2 // $r3 = (-b) * z
  62. add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high32(z * $r3): refinement 1 of 5
  63. mul $r3 u32 $r1 u32 $r2
  64. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 2
  65. mul $r3 u32 $r1 u32 $r2
  66. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 3
  67. mul $r3 u32 $r1 u32 $r2
  68. sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
  69. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 4
  70. mul $r3 u32 $r1 u32 $r2
  71. add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement 5
  72. mov b32 $r3 $r0 // keep a in $r3
  73. mul high $r0 u32 $r0 u32 $r2 // q estimate = high32(a * z)
  74. long cvt u32 $r2 neg u32 $r1 // $r2 = b again
  75. long add $r1 (mul u32 $r1 u32 $r0) $r3 // remainder estimate = a + (-b) * q
  76. sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
  77. set $p0 0x1 ge u32 $r1 $r2 // $p0 = remainder >= b
  78. $p0 sub b32 $r1 $r1 $r2 // first correction
  79. $p0 add b32 $r0 $r0 0x1
  80. $p0 set $p0 0x1 ge u32 $r1 $r2 // re-test only if the first correction was applied
  81. $p0 sub b32 $r1 $r1 $r2 // second correction
  82. long $p0 add b32 $r0 $r0 0x1
  83. long $p3 cvt s32 $r0 neg s32 $r0 // negate the quotient when the operand signs differed
  84. sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
  85. $p2 cvt s32 $r1 neg s32 $r1 // negate the remainder when the dividend was negative
  86. long ret // $r0 = quotient, $r1 = remainder
  87. //
  88. // SULDP [for each format]
  89. // $r4d: address
  90. // $r2: surface info (format)
  91. // $p0: access predicate
  92. // $p1, $p2: caching predicate, read as the bit pair $p2:$p1 (00: cv, 01: ca, 10: cg)
  93. //
  94. // RGBA32: load one 128-bit texel into $r0q; no per-component conversion
  95. $p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  96. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  97. $p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 // "cg" caching variant
  98. $p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 // "cv" caching variant
  99. long ret
  100. // RGBA16_UNORM: load one 64-bit texel into $r0d, convert the four u16 fields to f32 in $r0..$r3
  101. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  102. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // 64-bit load like the other RGBA16 handlers; only $r0/$r1 are consumed below ("ca" variant)
  103. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  104. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  105. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  106. cvt rn f32 $r3 u16 1 $r1 // $r3 = float(u16 field 1 of $r1)
  107. cvt rn f32 $r2 u16 0 $r1 // $r2 = float(u16 field 0 of $r1)
  108. mul f32 $r3 $r3 0x37800080 // 0x37800080 is the f32 nearest to 1/65535, so 0xffff scales to 1.0
  109. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  110. cvt rn f32 $r1 u16 1 $r0 // $r1 = float(u16 field 1 of $r0)
  111. mul f32 $r2 $r2 0x37800080
  112. cvt rn f32 $r0 u16 0 $r0 // $r0 = float(u16 field 0 of $r0); done last because it overwrites the source
  113. mul f32 $r1 $r1 0x37800080
  114. mul f32 $r0 $r0 0x37800080
  115. long ret
  116. // RGBA16_SNORM: load one 64-bit texel into $r0d, convert the four s16 fields to f32 in $r0..$r3
  117. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  118. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  119. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  120. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  121. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  122. cvt rn f32 $r3 s16 1 $r1 // $r3 = float(s16 field 1 of $r1)
  123. cvt rn f32 $r2 s16 0 $r1 // $r2 = float(s16 field 0 of $r1)
  124. mul f32 $r3 $r3 0x38000100 // 0x38000100 is the f32 nearest to 1/32767, so 0x7fff scales to 1.0
  125. cvt rn f32 $r1 s16 1 $r0 // $r1 = float(s16 field 1 of $r0)
  126. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  127. mul f32 $r2 $r2 0x38000100
  128. cvt rn f32 $r0 s16 0 $r0 // $r0 = float(s16 field 0 of $r0); done last because it overwrites the source
  129. mul f32 $r1 $r1 0x38000100
  130. mul f32 $r0 $r0 0x38000100 // NOTE(review): -32768 is not clamped and scales slightly below -1.0 — confirm whether callers need the clamp
  131. long ret
  132. // RGBA16_SINT: load one 64-bit texel into $r0d, widen the four s16 fields to s32 in $r0..$r3
  133. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  134. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  135. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  136. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  137. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  138. cvt s32 $r3 s16 1 $r1 // $r3 = s16 field 1 of $r1
  139. cvt s32 $r2 s16 0 $r1 // $r2 = s16 field 0 of $r1
  140. cvt s32 $r1 s16 1 $r0 // $r1 = s16 field 1 of $r0
  141. cvt s32 $r0 s16 0 $r0 // $r0 = s16 field 0 of $r0; done last because it overwrites the source
  142. long ret
  143. // RGBA16_UINT: load one 64-bit texel into $r0d, widen the four u16 fields to u32 in $r0..$r3
  144. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  145. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  146. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  147. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  148. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  149. cvt u32 $r3 u16 1 $r1 // $r3 = u16 field 1 of $r1
  150. cvt u32 $r2 u16 0 $r1 // $r2 = u16 field 0 of $r1
  151. cvt u32 $r1 u16 1 $r0 // $r1 = u16 field 1 of $r0
  152. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  153. cvt u32 $r0 u16 0 $r0 // $r0 = u16 field 0 of $r0; done last because it overwrites the source
  154. long ret
  155. // RGBA16_FLOAT: load one 64-bit texel into $r0d, convert the four f16 fields to f32 in $r0..$r3
  156. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  157. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  158. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  159. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  160. cvt f32 $r3 f16 $r1 1 // $r3 = f16 field 1 of $r1 (field selector is the trailing operand here)
  161. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  162. cvt f32 $r2 f16 $r1 0 // $r2 = f16 field 0 of $r1
  163. cvt f32 $r1 f16 $r0 1 // $r1 = f16 field 1 of $r0
  164. cvt f32 $r0 f16 $r0 0 // $r0 = f16 field 0 of $r0; done last because it overwrites the source
  165. long ret
  166. // RG32_FLOAT: load one 64-bit texel into $r0d ($r0, $r1 used as loaded); $r2 = 0, $r3 = 1.0f
  167. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  168. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  169. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  170. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  171. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  172. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  173. long mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  174. long ret
  175. // RG32_xINT: load one 64-bit texel into $r0d ($r0, $r1 used as loaded); $r2 = 0, $r3 = integer 1
  176. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  177. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  178. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  179. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  180. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  181. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  182. long mov b32 $r3 0x00000001 // fourth component absent in this format: integer 1
  183. long ret
  184. // RGB10A2_UNORM: load one 32-bit texel into $r0, unpack three 10-bit fields to f32 in $r0..$r2; $r3 = 1.0f
  185. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  186. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  187. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  188. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  189. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  190. ext u32 $r1 $r0 0x0a0a // $r1 = bit field of $r0 (presumably 10 bits starting at bit 10)
  191. long mov b32 $r3 0x3f800000 // $r3 = 1.0f; NOTE(review): the top 2 bits of the texel are not decoded — confirm this is intended
  192. ext u32 $r2 $r0 0x0a14 // $r2 = bit field of $r0 (presumably 10 bits starting at bit 20)
  193. long and b32 $r0 $r0 0x3ff // $r0 = low 10 bits
  194. cvt rn f32 $r2 u16 0 $r2
  195. cvt rn f32 $r1 u16 0 $r1
  196. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  197. mul f32 $r2 $r2 0x3a802008 // 0x3a802008 is the f32 nearest to 1/1023, so 0x3ff scales to 1.0
  198. cvt rn f32 $r0 u16 0 $r0
  199. mul f32 $r1 $r1 0x3a802008
  200. mul f32 $r0 $r0 0x3a802008
  201. long ret
  202. // RGB10A2_UINT: load one 32-bit texel into $r0, unpack three 10-bit fields as integers in $r0..$r2; $r3 = integer 1
  203. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  204. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  205. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  206. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  207. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  208. ext u32 $r1 $r0 0x0a0a // $r1 = bit field of $r0 (presumably 10 bits starting at bit 10)
  209. long mov b32 $r3 0x00000001 // $r3 = integer 1; NOTE(review): the top 2 bits of the texel are not decoded — confirm this is intended
  210. ext u32 $r2 $r0 0x0a14 // $r2 = bit field of $r0 (presumably 10 bits starting at bit 20)
  211. long and b32 $r0 $r0 0x3ff // $r0 = low 10 bits
  212. long ret
  213. // RGBA8_UNORM: load one 32-bit texel into $r0, convert its four u8 fields to f32 in $r0..$r3
  214. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  215. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  216. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  217. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  218. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  219. cvt rn f32 $r3 u8 3 $r0 // $r3 = float(u8 field 3)
  220. cvt rn f32 $r2 u8 2 $r0 // $r2 = float(u8 field 2)
  221. mul f32 $r3 $r3 0x3b808081 // scale by 0x3b808081, the f32 nearest to 1/255
  222. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  223. cvt rn f32 $r1 u8 1 $r0 // $r1 = float(u8 field 1)
  224. mul f32 $r2 $r2 0x3b808081
  225. cvt rn f32 $r0 u8 0 $r0 // $r0 = float(u8 field 0); done last because it overwrites the source
  226. mul f32 $r1 $r1 0x3b808081
  227. mul f32 $r0 $r0 0x3b808081
  228. long ret
  229. // RGBA8_SNORM: load one 32-bit texel into $r0, convert its four s8 fields to f32 in $r0..$r3
  230. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  231. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  232. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  233. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  234. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  235. cvt rn f32 $r3 s8 3 $r0 // $r3 = float(s8 field 3)
  236. cvt rn f32 $r2 s8 2 $r0 // $r2 = float(s8 field 2)
  237. mul f32 $r3 $r3 0x3c010204 // scale by 0x3c010204, the f32 nearest to 1/127
  238. cvt rn f32 $r1 s8 1 $r0 // $r1 = float(s8 field 1)
  239. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  240. mul f32 $r2 $r2 0x3c010204
  241. cvt rn f32 $r0 s8 0 $r0 // $r0 = float(s8 field 0); done last because it overwrites the source
  242. mul f32 $r1 $r1 0x3c010204
  243. mul f32 $r0 $r0 0x3c010204
  244. long ret
  245. // RGBA8_SINT: load one 32-bit texel into $r0, widen its four s8 fields to s32 in $r0..$r3
  246. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  247. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  248. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  249. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  250. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  251. cvt s32 $r3 s8 3 $r0 // $r3 = s8 field 3
  252. cvt s32 $r2 s8 2 $r0 // $r2 = s8 field 2
  253. cvt s32 $r1 s8 1 $r0 // $r1 = s8 field 1
  254. cvt s32 $r0 s8 0 $r0 // $r0 = s8 field 0; done last because it overwrites the source
  255. long ret
  256. // RGBA8_UINT: load one 32-bit texel into $r0, widen its four u8 fields to u32 in $r0..$r3
  257. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  258. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  259. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  260. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  261. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  262. cvt u32 $r3 u8 3 $r0 // $r3 = u8 field 3
  263. cvt u32 $r2 u8 2 $r0 // $r2 = u8 field 2
  264. cvt u32 $r1 u8 1 $r0 // $r1 = u8 field 1
  265. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  266. cvt u32 $r0 u8 0 $r0 // $r0 = u8 field 0; done last because it overwrites the source
  267. long ret
  268. // R5G6B5_UNORM: load one 16-bit texel into $r0, unpack three bit fields to f32 in $r0..$r2; $r3 = 1.0f
  269. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  270. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  271. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  272. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  273. ext u32 $r1 $r0 0x0605 // $r1 = bit field of $r0 (presumably 6 bits starting at bit 5)
  274. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  275. long mov b32 $r3 0x3f800000 // $r3 = 1.0f
  276. ext u32 $r2 $r0 0x050b // $r2 = bit field of $r0 (presumably 5 bits starting at bit 11)
  277. long and b32 $r0 $r0 0x1f // $r0 = low 5 bits
  278. cvt rn f32 $r2 u8 0 $r2
  279. cvt rn f32 $r1 u8 0 $r1
  280. mul f32 $r2 $r2 0x3d042108 // scale the 5-bit field by 0x3d042108, the f32 nearest to 1/31
  281. cvt rn f32 $r0 u8 0 $r0
  282. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  283. mul f32 $r1 $r1 0x3c820821 // scale the 6-bit field by 0x3c820821, the f32 nearest to 1/63
  284. mul f32 $r0 $r0 0x3d042108
  285. long ret
  286. // R5G5B5X1_UNORM: load one 16-bit texel into $r0, unpack three 5-bit fields to f32 in $r0..$r2; $r3 = 1.0f
  287. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  288. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  289. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  290. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  291. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  292. ext u32 $r1 $r0 0x0505 // $r1 = bit field of $r0 (presumably 5 bits starting at bit 5)
  293. ext u32 $r2 $r0 0x050a // $r2 = bit field of $r0 (presumably 5 bits starting at bit 10)
  294. long and b32 $r0 $r0 0x1f // $r0 = low 5 bits
  295. long mov b32 $r3 0x3f800000 // $r3 = 1.0f; the remaining top bit is not decoded
  296. cvt rn f32 $r2 u8 0 $r2
  297. cvt rn f32 $r1 u8 0 $r1
  298. cvt rn f32 $r0 u8 0 $r0
  299. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  300. mul f32 $r2 $r2 0x3d042108 // scale by 0x3d042108, the f32 nearest to 1/31
  301. mul f32 $r1 $r1 0x3d042108
  302. mul f32 $r0 $r0 0x3d042108
  303. long ret
  304. // RG16_UNORM: load one 32-bit texel into $r0, convert its two u16 fields to f32 in $r0/$r1; $r2 = 0, $r3 = 1.0f
  305. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  306. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  307. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  308. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  309. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  310. cvt rn f32 $r1 u16 1 $r0 // $r1 = float(u16 field 1)
  311. cvt rn f32 $r0 u16 0 $r0 // $r0 = float(u16 field 0); done last because it overwrites the source
  312. mul f32 $r1 $r1 0x37800080 // 0x37800080 is the f32 nearest to 1/65535, so 0xffff scales to 1.0
  313. mul f32 $r0 $r0 0x37800080
  314. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  315. long mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  316. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  317. long ret
  318. // RG16_SNORM: load one 32-bit texel into $r0, convert its two s16 fields to f32 in $r0/$r1; $r2 = 0, $r3 = 1.0f
  319. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  320. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  321. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  322. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  323. mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  324. cvt rn f32 $r1 s16 1 $r0 // $r1 = float(s16 field 1)
  325. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  326. mov b32 $r2 0x00000000 // third component absent in this format: 0
  327. cvt rn f32 $r0 s16 0 $r0 // $r0 = float(s16 field 0); done last because it overwrites the source
  328. mul f32 $r1 $r1 0x38000100 // 0x38000100 is the f32 nearest to 1/32767, so 0x7fff scales to 1.0
  329. mul f32 $r0 $r0 0x38000100
  330. long ret
  331. // RG16_SINT: load one 32-bit texel into $r0, widen its two s16 fields to s32 in $r0/$r1; $r2 = 0, $r3 = integer 1
  332. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  333. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  334. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  335. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  336. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  337. mov b32 $r3 0x00000001 // fourth component absent in this format: integer 1
  338. cvt s32 $r1 s16 1 $r0 // $r1 = s16 field 1
  339. mov b32 $r2 0x00000000 // third component absent in this format: 0
  340. cvt s32 $r0 s16 0 $r0 // $r0 = s16 field 0; done last because it overwrites the source
  341. long ret
  342. // RG16_UINT: load one 32-bit texel into $r0, widen its two u16 fields to u32 in $r0/$r1; $r2 = 0, $r3 = integer 1
  343. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  344. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  345. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  346. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  347. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  348. mov b32 $r3 0x00000001 // fourth component absent in this format: integer 1
  349. cvt u32 $r1 u16 1 $r0 // $r1 = u16 field 1
  350. mov b32 $r2 0x00000000 // third component absent in this format: 0
  351. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  352. cvt u32 $r0 u16 0 $r0 // $r0 = u16 field 0; done last because it overwrites the source
  353. long ret
  354. // RG16_FLOAT: load one 32-bit texel into $r0, convert its two f16 fields to f32 in $r0/$r1; $r2 = 0, $r3 = 1.0f
  355. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  356. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  357. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  358. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  359. mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  360. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  361. cvt f32 $r1 f16 $r0 1 // $r1 = f16 field 1
  362. mov b32 $r2 0x00000000 // third component absent in this format: 0
  363. cvt f32 $r0 f16 $r0 0 // $r0 = f16 field 0; done last because it overwrites the source
  364. long ret
  365. // R32_FLOAT: load one 32-bit texel into $r0 (used as loaded); $r1 = $r2 = 0, $r3 = 1.0f
  366. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  367. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  368. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  369. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  370. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  371. long mov b32 $r3 0x3f800000 // fourth component: 1.0f
  372. long mov b32 $r2 0x00000000 // third component: 0
  373. long mov b32 $r1 0x00000000 // second component: 0
  374. long ret
  375. // R32_xINT: load one 32-bit texel into $r0 (used as loaded); $r1 = $r2 = 0, $r3 = integer 1
  376. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  377. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  378. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  379. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  380. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  381. long mov b32 $r3 0x00000001 // fourth component: integer 1
  382. long mov b32 $r2 0x00000000 // third component: 0
  383. long mov b32 $r1 0x00000000 // second component: 0
  384. long ret
  385. // RG8_UNORM: load one 16-bit texel into $r0, convert its two u8 fields to f32 in $r0/$r1; $r2 = 0, $r3 = 1.0f
  386. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  387. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  388. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  389. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  390. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  391. mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  392. cvt rn f32 $r1 u8 1 $r0 // $r1 = float(u8 field 1)
  393. mov b32 $r2 0x00000000 // third component absent in this format: 0
  394. cvt rn f32 $r0 u8 0 $r0 // $r0 = float(u8 field 0); done last because it overwrites the source
  395. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  396. mul f32 $r1 $r1 0x3b808081 // scale by 0x3b808081, the f32 nearest to 1/255
  397. mul f32 $r0 $r0 0x3b808081
  398. long ret
  399. // RG8_SNORM: load one 16-bit texel into $r0, convert its two s8 fields to f32 in $r0/$r1; $r2 = 0, $r3 = 1.0f
  400. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  401. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  402. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  403. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  404. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  405. long mov b32 $r3 0x3f800000 // fourth component absent in this format: 1.0f
  406. cvt rn f32 $r1 s8 1 $r0 // $r1 = float(s8 field 1)
  407. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  408. cvt rn f32 $r0 s8 0 $r0 // $r0 = float(s8 field 0); done last because it overwrites the source
  409. mul f32 $r1 $r1 0x3c010204 // scale by 0x3c010204, the f32 nearest to 1/127
  410. mul f32 $r0 $r0 0x3c010204
  411. long ret
  412. // RG8_UINT: load one 16-bit texel into $r0, widen its two u8 fields to u32 in $r0/$r1; $r2 = 0, $r3 = integer 1
  413. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  414. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  415. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  416. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  417. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  418. long mov b32 $r3 0x00000001 // fourth component absent in this format: integer 1
  419. cvt u32 $r1 u8 1 $r0 // $r1 = u8 field 1
  420. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  421. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  422. cvt u32 $r0 u8 0 $r0 // $r0 = u8 field 0; done last because it overwrites the source
  423. long ret
  424. // RG8_SINT: load one 16-bit texel into $r0, widen its two s8 fields to s32 in $r0/$r1; $r2 = 0, $r3 = integer 1
  425. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  426. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  427. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  428. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  429. long mov b32 $r3 0x00000001 // fourth component absent in this format: integer 1
  430. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  431. cvt s32 $r1 s8 1 $r0 // $r1 = s8 field 1
  432. long mov b32 $r2 0x00000000 // third component absent in this format: 0
  433. cvt s32 $r0 s8 0 $r0 // $r0 = s8 field 0; done last because it overwrites the source
  434. long ret
  435. // R16_UNORM: load one 16-bit texel into $r0 and convert it to f32; $r1 = $r2 = 0, $r3 = 1.0f
  436. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  437. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  438. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  439. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  440. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  441. long mov b32 $r3 0x3f800000 // fourth component: 1.0f
  442. cvt rn f32 $r0 u16 0 $r0 // $r0 = float(u16 texel)
  443. long mov b32 $r2 0x00000000 // third component: 0
  444. long mov b32 $r1 0x00000000 // second component: 0
  445. mul f32 $r0 $r0 0x37800080 // 0x37800080 is the f32 nearest to 1/65535, so 0xffff scales to 1.0
  446. long ret
  447. // R16_SNORM: load one 16-bit texel into $r0 and convert it (as s16) to f32; $r1 = $r2 = 0, $r3 = 1.0f
  448. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  449. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  450. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  451. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  452. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  453. mov b32 $r3 0x3f800000 // fourth component: 1.0f
  454. cvt rn f32 $r0 s16 0 $r0 // $r0 = float(low 16 bits read as s16)
  455. long mov b32 $r2 0x00000000 // third component: 0
  456. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  457. long mov b32 $r1 0x00000000 // second component: 0
  458. mul f32 $r0 $r0 0x38000100 // 0x38000100 is the f32 nearest to 1/32767, so 0x7fff scales to 1.0
  459. long ret
  460. // R16_SINT: load one 16-bit texel into $r0 with the s16 load type (no separate conversion); $r1 = $r2 = 0, $r3 = integer 1
  461. $p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  462. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  463. $p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  464. $p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  465. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  466. long mov b32 $r3 0x00000001 // fourth component: integer 1
  467. long mov b32 $r2 0x00000000 // third component: 0
  468. long mov b32 $r1 0x00000000 // second component: 0
  469. long ret
  470. // R16_UINT: load one 16-bit texel into $r0 with the u16 load type (no separate conversion); $r1 = $r2 = 0, $r3 = integer 1
  471. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  472. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  473. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  474. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  475. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  476. long mov b32 $r3 0x00000001 // fourth component: integer 1
  477. long mov b32 $r2 0x00000000 // third component: 0
  478. long mov b32 $r1 0x00000000 // second component: 0
  479. long ret
  480. // R16_FLOAT: load one 16-bit texel into $r0 and convert it from f16 to f32; $r1 = $r2 = 0, $r3 = 1.0f
  481. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  482. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  483. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  484. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  485. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  486. long mov b32 $r3 0x3f800000 // fourth component: 1.0f
  487. long mov b32 $r2 0x00000000 // third component: 0
  488. cvt f32 $r0 f16 $r0 0 // $r0 = f16 field 0 of $r0 widened to f32
  489. mov b32 $r1 0x00000000 // second component: 0
  490. long ret
  491. // R8_UNORM: load one 8-bit texel into $r0 and convert it to f32; $r1 = $r2 = 0, $r3 = 1.0f
  492. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  493. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  494. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  495. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  496. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  497. mov b32 $r3 0x3f800000 // fourth component: 1.0f
  498. cvt rn f32 $r0 u8 0 $r0 // $r0 = float(u8 texel)
  499. mov b32 $r2 0x00000000 // third component: 0
  500. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  501. mul f32 $r0 $r0 0x3b808081 // scale by 0x3b808081, the f32 nearest to 1/255
  502. mov b32 $r1 0x00000000 // second component: 0
  503. long ret
  504. // R8_SNORM: load one 8-bit texel into $r0 and convert it (as s8) to f32; $r1 = $r2 = 0, $r3 = 1.0f
  505. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  506. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  507. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  508. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  509. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  510. mov b32 $r3 0x3f800000 // fourth component: 1.0f
  511. cvt rn f32 $r0 s8 0 $r0 // $r0 = float(low 8 bits read as s8)
  512. mov b32 $r2 0x00000000 // third component: 0
  513. mul f32 $r0 $r0 0x3c010204 // scale by 0x3c010204, the f32 nearest to 1/127
  514. mov b32 $r1 0x00000000 // second component: 0
  515. long ret
  516. // R8_SINT: load one 8-bit texel into $r0 with the s8 load type (no separate conversion); $r1 = $r2 = 0, $r3 = integer 1
  517. $p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  518. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  519. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  520. $p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  521. $p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  522. long mov b32 $r3 0x00000001 // fourth component: integer 1
  523. long mov b32 $r2 0x00000000 // third component: 0
  524. long mov b32 $r1 0x00000000 // second component: 0
  525. long ret
  526. // R8_UINT: load one 8-bit texel into $r0 with the u8 load type (no separate conversion); $r1 = $r2 = 0, $r3 = integer 1
  527. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  528. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  529. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  530. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  531. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  532. long mov b32 $r3 0x00000001 // fourth component: integer 1
  533. long mov b32 $r2 0x00000000 // third component: 0
  534. long mov b32 $r1 0x00000000 // second component: 0
  535. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  536. long ret
  537. // R11G11B10_FLOAT TODO: unfinished. The texel is loaded into $r3 and then overwritten; $r0..$r2 are never written.
  538. $p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0 // executed when $p1 is set on entry: "ca" caching variant
  539. set $p1 0x1 $p1 xor not $p2 // recompute $p1 from $p1 and $p2 (presumably $p1 xor !$p2) so that it now selects the "cv" variant
  540. $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 // "cg" variant
  541. $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 // "cv" variant
  542. long mov b32 $r3 0x3f800000 // $r3 = 1.0f, discarding the loaded texel
  543. long nop // no unpacking of the packed components is implemented
  544. long ret
  545. //
  546. // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
  547. //
  548. // INPUT:   $r0d (x)
  549. // OUTPUT:  $r0d (rcp(x))
  550. // CLOBBER: $r2 - $r7
  551. // SIZE:    9 * 8 bytes
  552. //
  553. long nop // stub: the Newton-Raphson iteration described in the header is not implemented here
  554. long ret // returns with $r0d unchanged
  555. // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
  556. //
  557. // INPUT:   $r0d (x)
  558. // OUTPUT:  $r0d (rsqrt(x))
  559. // CLOBBER: $r2 - $r7
  560. // SIZE:    14 * 8 bytes
  561. //
  562. long nop // stub: the Newton-Raphson iteration described in the header is not implemented here
  563. long ret // returns with $r0d unchanged
  564. //
  565. // Trap handler.
  566. // Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
  567. // Low 32 bytes of l[] memory shouldn't be used if resumability is required.
  568. //
  569. // Trap info:
  570. // 0x000: mutex
  571. // 0x004: PC
  572. // 0x008: trapstat
  573. // 0x00c: warperr
  574. // 0x010: tidx
  575. // 0x014: tidy
  576. // 0x018: tidz
  577. // 0x01c: ctaidx
  578. // 0x020: ctaidy
  579. // 0x024: ctaidz
  580. // 0x030: $r0q
  581. // 0x130: $flags
  582. // 0x140: s[]
  583. //
  584. st b128 wb l[0x00] $r0q // save $r0..$r3 in the low 16 bytes of l[] so they can be used as scratch
  585. // check state of the warp and continue if it didn't cause the trap
  586. long mov b32 $r1 $trapstat
  587. long mov b32 $r3 $warperr
  588. mov $r2 $flags mask 0xffff // snapshot $flags in $r2: restored at end_cont, stored at +0x130 otherwise
  589. and b32 0 $c $r1 $r3 // result discarded; only $c is set from trapstat & warperr
  590. e $c bra #end_cont
  591. // spill control flow stack to l[]
  592. long mov b32 $r3 16 // 16 iterations of the preret below
  593. spill_cfstack:
  594. preret #end_exit
  595. sub b32 $r3 $c $r3 0x1
  596. lg $c bra #spill_cfstack
  597. // retrieve pointer to trap info
  598. mov b32 $r0 c0[0x1900]
  599. mov b32 $r1 c0[0x1904]
  600. // we only let a single faulting thread store its state
  601. mov b32 $r3 0x1
  602. exch b32 $r3 g[$r0d] $r3 // swap 1 into the mutex word at offset 0x000; $r3 receives the previous value
  603. joinat #end_exit
  604. set $p0 0x1 eq u32 $r3 0x1 // $p0 = mutex was already taken
  605. join $p0 nop // threads that lost the race leave through the join point (end_exit)
  606. // store $c and $p registers
  607. st b32 wb g[$r0d+0x130] $r2
  608. // store $trapstat and $warperr
  609. long mov b32 $r2 $trapstat
  610. long mov b32 $r3 $warperr
  611. st b64 wb g[$r0d+0x8] $r2d
  612. // store registers
  613. st b128 wb g[$r0d+0x40] $r4q
  614. st b128 wb g[$r0d+0x50] $r8q
  615. st b128 wb g[$r0d+0x60] $r12q
  616. st b128 wb g[$r0d+0x70] $r16q
  617. st b128 wb g[$r0d+0x80] $r20q
  618. st b128 wb g[$r0d+0x90] $r24q
  619. st b128 wb g[$r0d+0xa0] $r28q
  620. st b128 wb g[$r0d+0xb0] $r32q
  621. st b128 wb g[$r0d+0xc0] $r36q
  622. st b128 wb g[$r0d+0xd0] $r40q
  623. st b128 wb g[$r0d+0xe0] $r44q
  624. st b128 wb g[$r0d+0xf0] $r48q
  625. st b128 wb g[$r0d+0x100] $r52q
  626. st b128 wb g[$r0d+0x110] $r56q
  627. st b128 wb g[$r0d+0x120] $r60q
  628. ld b64 $r2d cs l[0x0] // original $r0/$r1, saved to l[] on entry
  629. st b64 wb g[$r0d+0x30] $r2d
  630. ld b64 $r2d cs l[0x8] // original $r2/$r3, saved to l[] on entry
  631. st b64 wb g[$r0d+0x38] $r2d
  632. // store thread id
  633. long mov b32 $r2 $tidx
  634. long mov b32 $r3 $tidy
  635. st b64 wb g[$r0d+0x10] $r2d
  636. long mov b32 $r2 $tidz
  637. long mov b32 $r3 $ctaidx
  638. st b64 wb g[$r0d+0x18] $r2d
  639. long mov b32 $r2 $ctaidy
  640. long mov b32 $r3 $ctaidz
  641. st b64 wb g[$r0d+0x20] $r2d
  642. // store shared memory (in reverse order so $r0d is base again at the end)
  643. long mov b32 $r3 $smemsz
  644. sub b32 $r3 $c $r3 0x4 // $r3 = offset of the last 32-bit word of s[]
  645. s $c bra #shared_done // skip the copy when the subtraction went negative
  646. add b32 $r0 $c $r0 $r3 // 64-bit add: $r0d += $r3
  647. add b32 $r1 $r1 0x0 $c
  648. shared_loop:
  649. long ld b32 $r2 s[$r3]
  650. long st b32 wb g[$r0d+0x140] $r2
  651. sub b32 $r0 $c $r0 0x4 // 64-bit subtract: $r0d -= 4
  652. sub b32 $r1 $r1 0x0 $c
  653. sub b32 $r3 $c $r3 0x4
  654. lg $c bra #shared_loop // NOTE(review): the loop ends when $r3 reaches 0, so s[0] appears never to be copied — confirm
  655. shared_done:
  656. // search the stack for trap entry to retrieve PC
  657. mov b32 $r0 c0[0x1908]
  658. mov b32 $r1 c0[0x190c]
  659. membar sys
  660. // invalidate caches so we can read stack entries via g[]
  661. cctl ivall 0 l[0]
  662. cctl ivall 0 g[$r0d]
  663. // get offsets
  664. mov b32 $r2 $physid
  665. ext u32 $r3 $r2 0x0814 // MP id
  666. ext u32 $r2 $r2 0x0608 // warp id
  667. mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
  668. mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
  669. add b32 $r2 $r2 $r3 // MP + warp offset
  670. add b32 $r0 $c $r0 $r2
  671. add b32 $r1 $r1 0x0 $c
  672. mov b32 $r3 c0[0x1918] // cstack size; loaded once before the loop so the countdown below can reach zero
  673. search_cstack:
  674. ld u8 $r2 cv g[$r0d+0x8] // byte at +0x8 of the current 0x10-byte stack entry
  675. set $p0 0x1 eq u32 $r2 0xa // 0xa marks the entry being searched for (the trap entry, per the comment above)
  676. $p0 bra #entry_found
  677. add b32 $r0 $c $r0 0x10 // advance $r0d to the next entry
  678. add b32 $r1 $r1 0x0 $c
  679. sub b32 $r3 $c $r3 0x10 // one entry fewer left to scan
  680. lg $c bra #search_cstack
  681. bra #end_exit // whole stack scanned without a match: give up on the PC
  682. entry_found:
  683. // load PC (may be unaligned and spread out)
  684. ld b32 $r2 cv g[$r0d]
  685. mov b32 $r0 c0[0x1900]
  686. mov b32 $r1 c0[0x1904]
  687. st b32 wb g[$r0d+0x4] $r2 // trap info offset 0x004: PC
  688. join nop
  689. // invalidate caches and exit
  690. end_exit:
  691. cctl ivall 0 g[0]
  692. bpt pause 0x0
  693. rtt terminate
  694. end_cont:
  695. bpt pause 0x0
  696. mov $flags $r2 mask 0xffff // restore the flags saved on entry
  697. ld b128 $r0q cs l[0x00] // restore $r0..$r3 saved on entry
  698. rtt
  699.