Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. .section #gk104_builtin_code
  2. // DIV U32: unsigned 32-bit divide, returns quotient and remainder
  3. //
  4. // UNR recurrence (q = a / b):
  5. // look for z such that 2^32 - b <= b * z < 2^32
  6. // then q - 1 <= (a * z) / 2^32 <= q
  7. //
  8. // INPUT:   $r0: dividend, $r1: divisor
  9. // OUTPUT:  $r0: result, $r1: modulus
  10. // CLOBBER: $r2 - $r3, $p0 - $p1 (the code below only writes $p0)
  11. // SIZE:    22 / 14 * 8 bytes
  12. //
  13. gk104_div_u32:
  14.    sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 // scheduling word; one precedes each group of 7 instructions in this routine
  15.    bfind u32 $r2 $r1 // $r2 = bfind(b)
  16.    long xor b32 $r2 $r2 0x1f // $r2 ^= 31
  17.    long mov b32 $r3 0x1
  18.    shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2 (clamped shift): initial estimate
  19.    long cvt u32 $r1 neg u32 $r1 // $r1 = -b
  20.    long mul $r3 u32 $r1 u32 $r2 // $r3 = (-b) * z
  21.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += hi32(z * $r3); refinement step 1 of 5
  22.    sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
  23.    mul $r3 u32 $r1 u32 $r2
  24.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 2
  25.    mul $r3 u32 $r1 u32 $r2
  26.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 3
  27.    mul $r3 u32 $r1 u32 $r2
  28.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 4
  29.    mul $r3 u32 $r1 u32 $r2
  30.    sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
  31.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 5
  32.    mov b32 $r3 $r0 // keep dividend a in $r3
  33.    mul high $r0 u32 $r0 u32 $r2 // q = hi32(a * z)
  34.    long cvt u32 $r2 neg u32 $r1 // $r2 = b again ($r1 still holds -b)
  35.    long add $r1 (mul u32 $r1 u32 $r0) $r3 // r = a - b * q
  36.    set $p0 0x1 ge u32 $r1 $r2 // first correction needed if r >= b
  37.    $p0 sub b32 $r1 $r1 $r2
  38.    sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
  39.    $p0 add b32 $r0 $r0 0x1
  40.    $p0 set $p0 0x1 ge u32 $r1 $r2 // second correction is only evaluated if the first one ran
  41.    $p0 sub b32 $r1 $r1 $r2
  42.    $p0 add b32 $r0 $r0 0x1
  43.    long ret
  44.  
  45. // DIV S32, like DIV U32 after taking ABS(inputs); signs are re-applied at the end
  46. //
  47. // INPUT:   $r0: dividend, $r1: divisor
  48. // OUTPUT:  $r0: result, $r1: modulus
  49. // CLOBBER: $r2 - $r3, $p0 - $p3
  50. //
  51. gk104_div_s32:
  52.    set $p2 0x1 lt s32 $r0 0x0 // $p2 = dividend is negative
  53.    set $p3 0x1 lt s32 $r1 0x0 xor $p2 // $p3 = operand signs differ
  54.    sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
  55.    long cvt s32 $r0 abs s32 $r0
  56.    long cvt s32 $r1 abs s32 $r1
  57.    bfind u32 $r2 $r1 // from here on: same sequence as gk104_div_u32 on |a|, |b|
  58.    long xor b32 $r2 $r2 0x1f
  59.    long mov b32 $r3 0x1
  60.    shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2 (clamped shift): initial estimate
  61.    cvt u32 $r1 neg u32 $r1 // $r1 = -|b|
  62.    sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
  63.    mul $r3 u32 $r1 u32 $r2
  64.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 1 of 5
  65.    mul $r3 u32 $r1 u32 $r2
  66.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 2
  67.    mul $r3 u32 $r1 u32 $r2
  68.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 3
  69.    mul $r3 u32 $r1 u32 $r2
  70.    sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
  71.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 4
  72.    mul $r3 u32 $r1 u32 $r2
  73.    add $r2 (mul high u32 $r2 u32 $r3) $r2 // step 5
  74.    mov b32 $r3 $r0 // keep |a| in $r3
  75.    mul high $r0 u32 $r0 u32 $r2 // q = hi32(|a| * z)
  76.    long cvt u32 $r2 neg u32 $r1 // $r2 = |b|
  77.    long add $r1 (mul u32 $r1 u32 $r0) $r3 // r = |a| - |b| * q
  78.    sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
  79.    set $p0 0x1 ge u32 $r1 $r2 // first correction needed if r >= |b|
  80.    $p0 sub b32 $r1 $r1 $r2
  81.    $p0 add b32 $r0 $r0 0x1
  82.    $p0 set $p0 0x1 ge u32 $r1 $r2 // second correction is only evaluated if the first one ran
  83.    $p0 sub b32 $r1 $r1 $r2
  84.    long $p0 add b32 $r0 $r0 0x1
  85.    long $p3 cvt s32 $r0 neg s32 $r0 // negate the quotient when the operand signs differed
  86.    sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
  87.    $p2 cvt s32 $r1 neg s32 $r1 // negate the modulus when the dividend was negative
  88.    long ret
  89.  
  90. // SULDP [for each format]: surface load, then unpack into $r0..$r3
  91. // $r4d: address
  92. // $r2: surface info (format)
  93. // $p0: access predicate
  94. // $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
  95. //
  96. // RGBA32: 128-bit load into $r0q, returned without conversion
  97. $p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  98. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant (encoding 00 above)
  99. $p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 // cg variant, runs when $p2 is set
  100. $p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 // cv variant
  101. long ret
  102. // RGBA16_UNORM: 64-bit texel, four u16 channels converted to floats in $r0..$r3 scaled by 1/65535
  103. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  104. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // 4 x 16 bit = 64 bit, same load width as the other RGBA16 variants
  105. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  106. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  107. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  108. cvt rn f32 $r3 u16 1 $r1 // $r3 = float(half-word 1 of $r1)
  109. cvt rn f32 $r2 u16 0 $r1 // $r2 = float(half-word 0 of $r1)
  110. mul f32 $r3 $r3 0x37800080 // 0x37800080 = 1/65535 rounded to nearest f32, so 65535 maps to exactly 1.0
  111. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  112. cvt rn f32 $r1 u16 1 $r0 // $r1 = float(half-word 1 of $r0)
  113. mul f32 $r2 $r2 0x37800080
  114. cvt rn f32 $r0 u16 0 $r0 // $r0 = float(half-word 0 of $r0); done last because it overwrites the source
  115. mul f32 $r1 $r1 0x37800080
  116. mul f32 $r0 $r0 0x37800080
  117. long ret
  118. // RGBA16_SNORM: 64-bit texel, four s16 channels converted to floats in $r0..$r3 scaled by 1/32767
  119. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  120. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  121. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  122. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  123. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  124. cvt rn f32 $r3 s16 1 $r1 // $r3 = float(signed half-word 1 of $r1)
  125. cvt rn f32 $r2 s16 0 $r1 // $r2 = float(signed half-word 0 of $r1)
  126. mul f32 $r3 $r3 0x38000100 // 0x38000100 = 1/32767 rounded to nearest f32, so 32767 maps to exactly 1.0
  127. cvt rn f32 $r1 s16 1 $r0 // $r1 = float(signed half-word 1 of $r0)
  128. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  129. mul f32 $r2 $r2 0x38000100
  130. cvt rn f32 $r0 s16 0 $r0 // done last because it overwrites the source
  131. mul f32 $r1 $r1 0x38000100
  132. mul f32 $r0 $r0 0x38000100
  133. long ret
  134. // RGBA16_SINT: 64-bit texel, four s16 channels converted to s32 in $r0..$r3
  135. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  136. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  137. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  138. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  139. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  140. cvt s32 $r3 s16 1 $r1 // $r3 = signed half-word 1 of $r1
  141. cvt s32 $r2 s16 0 $r1 // $r2 = signed half-word 0 of $r1
  142. cvt s32 $r1 s16 1 $r0 // $r1 = signed half-word 1 of $r0
  143. cvt s32 $r0 s16 0 $r0 // done last because it overwrites the source
  144. long ret
  145. // RGBA16_UINT: 64-bit texel, four u16 channels converted to u32 in $r0..$r3
  146. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  147. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  148. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  149. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  150. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  151. cvt u32 $r3 u16 1 $r1 // $r3 = half-word 1 of $r1
  152. cvt u32 $r2 u16 0 $r1 // $r2 = half-word 0 of $r1
  153. cvt u32 $r1 u16 1 $r0 // $r1 = half-word 1 of $r0
  154. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  155. cvt u32 $r0 u16 0 $r0 // done last because it overwrites the source
  156. long ret
  157. // RGBA16_FLOAT: 64-bit texel, four f16 channels converted to f32 in $r0..$r3
  158. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  159. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  160. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  161. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  162. cvt f32 $r3 f16 $r1 1 // $r3 = f32(half 1 of $r1)
  163. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  164. cvt f32 $r2 f16 $r1 0 // $r2 = f32(half 0 of $r1)
  165. cvt f32 $r1 f16 $r0 1 // $r1 = f32(half 1 of $r0)
  166. cvt f32 $r0 f16 $r0 0 // done last because it overwrites the source
  167. long ret
  168. // RG32_FLOAT: 64-bit load into $r0d; $r2 and $r3 are filled with constants
  169. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  170. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  171. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  172. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  173. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  174. long mov b32 $r2 0x00000000 // third component = 0
  175. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  176. long ret
  177. // RG32_xINT: 64-bit load into $r0d; $r2 and $r3 are filled with integer constants
  178. $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  179. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  180. $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
  181. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  182. $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
  183. long mov b32 $r2 0x00000000 // third component = 0
  184. long mov b32 $r3 0x00000001 // fourth component = integer 1
  185. long ret
  186. // RGB10A2_UNORM: 32-bit texel, three 10-bit fields converted to floats in $r0..$r2 scaled by 1/1023; $r3 is set to 1.0f
  187. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  188. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  189. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  190. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  191. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  192. ext u32 $r1 $r0 0x0a0a // bitfield extract; descriptor presumably means 10 bits at offset 10
  193. long mov b32 $r3 0x3f800000 // the 2-bit field is not decoded; $r3 is a constant 1.0f
  194. ext u32 $r2 $r0 0x0a14 // presumably 10 bits at offset 20
  195. long and b32 $r0 $r0 0x3ff // low 10 bits
  196. cvt rn f32 $r2 u16 0 $r2
  197. cvt rn f32 $r1 u16 0 $r1
  198. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  199. mul f32 $r2 $r2 0x3a802008 // 0x3a802008 = 1/1023 rounded to nearest f32, so 1023 maps to exactly 1.0
  200. cvt rn f32 $r0 u16 0 $r0
  201. mul f32 $r1 $r1 0x3a802008
  202. mul f32 $r0 $r0 0x3a802008
  203. long ret
  204. // RGB10A2_UINT: 32-bit texel, three 10-bit fields extracted into $r0..$r2; $r3 is set to integer 1
  205. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  206. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  207. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  208. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  209. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  210. ext u32 $r1 $r0 0x0a0a // bitfield extract; descriptor presumably means 10 bits at offset 10
  211. long mov b32 $r3 0x00000001 // the 2-bit field is not decoded; $r3 is a constant 1
  212. ext u32 $r2 $r0 0x0a14 // presumably 10 bits at offset 20
  213. long and b32 $r0 $r0 0x3ff // low 10 bits; done last because it overwrites the source
  214. long ret
  215. // RGBA8_UNORM: 32-bit texel, four u8 channels converted to floats in $r0..$r3 scaled by 1/255
  216. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  217. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  218. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  219. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  220. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  221. cvt rn f32 $r3 u8 3 $r0 // $r3 = float(byte 3 of $r0)
  222. cvt rn f32 $r2 u8 2 $r0 // $r2 = float(byte 2 of $r0)
  223. mul f32 $r3 $r3 0x3b808081 // 0x3b808081 = 1/255 as f32
  224. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  225. cvt rn f32 $r1 u8 1 $r0 // $r1 = float(byte 1 of $r0)
  226. mul f32 $r2 $r2 0x3b808081
  227. cvt rn f32 $r0 u8 0 $r0 // done last because it overwrites the source
  228. mul f32 $r1 $r1 0x3b808081
  229. mul f32 $r0 $r0 0x3b808081
  230. long ret
  231. // RGBA8_SNORM: 32-bit texel, four s8 channels converted to floats in $r0..$r3 scaled by 1/127
  232. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  233. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  234. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  235. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  236. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  237. cvt rn f32 $r3 s8 3 $r0 // $r3 = float(signed byte 3 of $r0)
  238. cvt rn f32 $r2 s8 2 $r0 // $r2 = float(signed byte 2 of $r0)
  239. mul f32 $r3 $r3 0x3c010204 // 0x3c010204 = 1/127 as f32
  240. cvt rn f32 $r1 s8 1 $r0 // $r1 = float(signed byte 1 of $r0)
  241. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  242. mul f32 $r2 $r2 0x3c010204
  243. cvt rn f32 $r0 s8 0 $r0 // done last because it overwrites the source
  244. mul f32 $r1 $r1 0x3c010204
  245. mul f32 $r0 $r0 0x3c010204
  246. long ret
  247. // RGBA8_SINT: 32-bit texel, four s8 channels converted to s32 in $r0..$r3
  248. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  249. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  250. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  251. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  252. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  253. cvt s32 $r3 s8 3 $r0 // $r3 = signed byte 3 of $r0
  254. cvt s32 $r2 s8 2 $r0 // $r2 = signed byte 2 of $r0
  255. cvt s32 $r1 s8 1 $r0 // $r1 = signed byte 1 of $r0
  256. cvt s32 $r0 s8 0 $r0 // done last because it overwrites the source
  257. long ret
  258. // RGBA8_UINT: 32-bit texel, four u8 channels converted to u32 in $r0..$r3
  259. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  260. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  261. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  262. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  263. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  264. cvt u32 $r3 u8 3 $r0 // $r3 = byte 3 of $r0
  265. cvt u32 $r2 u8 2 $r0 // $r2 = byte 2 of $r0
  266. cvt u32 $r1 u8 1 $r0 // $r1 = byte 1 of $r0
  267. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  268. cvt u32 $r0 u8 0 $r0 // done last because it overwrites the source
  269. long ret
  270. // R5G6B5_UNORM: 16-bit texel, three bit fields converted to floats in $r0..$r2; $r3 is set to 1.0f
  271. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  272. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  273. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  274. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  275. ext u32 $r1 $r0 0x0605 // bitfield extract; descriptor presumably means 6 bits at offset 5
  276. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  277. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  278. ext u32 $r2 $r0 0x050b // presumably 5 bits at offset 11
  279. long and b32 $r0 $r0 0x1f // low 5 bits
  280. cvt rn f32 $r2 u8 0 $r2
  281. cvt rn f32 $r1 u8 0 $r1
  282. mul f32 $r2 $r2 0x3d042108 // 0x3d042108 = 1/31 as f32
  283. cvt rn f32 $r0 u8 0 $r0
  284. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  285. mul f32 $r1 $r1 0x3c820821 // 0x3c820821 = 1/63 as f32 (the 6-bit field)
  286. mul f32 $r0 $r0 0x3d042108
  287. long ret
  288. // R5G5B5X1_UNORM: 16-bit texel, three 5-bit fields converted to floats in $r0..$r2 scaled by 1/31; $r3 is set to 1.0f
  289. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  290. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  291. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  292. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  293. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  294. ext u32 $r1 $r0 0x0505 // bitfield extract; descriptor presumably means 5 bits at offset 5
  295. ext u32 $r2 $r0 0x050a // presumably 5 bits at offset 10
  296. long and b32 $r0 $r0 0x1f // low 5 bits
  297. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  298. cvt rn f32 $r2 u8 0 $r2
  299. cvt rn f32 $r1 u8 0 $r1
  300. cvt rn f32 $r0 u8 0 $r0
  301. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  302. mul f32 $r2 $r2 0x3d042108 // 0x3d042108 = 1/31 as f32
  303. mul f32 $r1 $r1 0x3d042108
  304. mul f32 $r0 $r0 0x3d042108
  305. long ret
  306. // RG16_UNORM: 32-bit texel, two u16 channels converted to floats in $r0, $r1 scaled by 1/65535; $r2 = 0, $r3 = 1.0f
  307. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  308. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  309. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  310. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  311. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  312. cvt rn f32 $r1 u16 1 $r0 // $r1 = float(half-word 1 of $r0)
  313. cvt rn f32 $r0 u16 0 $r0 // done second because it overwrites the source
  314. mul f32 $r1 $r1 0x37800080 // 0x37800080 = 1/65535 rounded to nearest f32, so 65535 maps to exactly 1.0
  315. mul f32 $r0 $r0 0x37800080
  316. long mov b32 $r2 0x00000000 // third component = 0
  317. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  318. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  319. long ret
  320. // RG16_SNORM: 32-bit texel, two s16 channels converted to floats in $r0, $r1 scaled by 1/32767; $r2 = 0, $r3 = 1.0f
  321. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  322. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  323. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  324. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  325. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  326. cvt rn f32 $r1 s16 1 $r0 // $r1 = float(signed half-word 1 of $r0)
  327. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  328. mov b32 $r2 0x00000000 // third component = 0
  329. cvt rn f32 $r0 s16 0 $r0 // done second because it overwrites the source
  330. mul f32 $r1 $r1 0x38000100 // 0x38000100 = 1/32767 rounded to nearest f32, so 32767 maps to exactly 1.0
  331. mul f32 $r0 $r0 0x38000100
  332. long ret
  333. // RG16_SINT: 32-bit texel, two s16 channels converted to s32 in $r0, $r1; $r2 = 0, $r3 = 1
  334. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  335. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  336. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  337. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  338. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  339. mov b32 $r3 0x00000001 // fourth component = integer 1
  340. cvt s32 $r1 s16 1 $r0 // $r1 = signed half-word 1 of $r0
  341. mov b32 $r2 0x00000000 // third component = 0
  342. cvt s32 $r0 s16 0 $r0 // done last because it overwrites the source
  343. long ret
  344. // RG16_UINT: 32-bit texel, two u16 channels converted to u32 in $r0, $r1; $r2 = 0, $r3 = 1
  345. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  346. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  347. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  348. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  349. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  350. mov b32 $r3 0x00000001 // fourth component = integer 1
  351. cvt u32 $r1 u16 1 $r0 // $r1 = half-word 1 of $r0
  352. mov b32 $r2 0x00000000 // third component = 0
  353. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  354. cvt u32 $r0 u16 0 $r0 // done last because it overwrites the source
  355. long ret
  356. // RG16_FLOAT: 32-bit texel, two f16 channels converted to f32 in $r0, $r1; $r2 = 0, $r3 = 1.0f
  357. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  358. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  359. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  360. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  361. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  362. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  363. cvt f32 $r1 f16 $r0 1 // $r1 = f32(half 1 of $r0)
  364. mov b32 $r2 0x00000000 // third component = 0
  365. cvt f32 $r0 f16 $r0 0 // done last because it overwrites the source
  366. long ret
  367. // R32_FLOAT: 32-bit load into $r0, returned as is; $r1 = $r2 = 0, $r3 = 1.0f
  368. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  369. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  370. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  371. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  372. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  373. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  374. long mov b32 $r2 0x00000000
  375. long mov b32 $r1 0x00000000
  376. long ret
  377. // R32_xINT: 32-bit load into $r0, returned as is; $r1 = $r2 = 0, $r3 = 1
  378. $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  379. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  380. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  381. $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
  382. $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
  383. long mov b32 $r3 0x00000001 // fourth component = integer 1
  384. long mov b32 $r2 0x00000000
  385. long mov b32 $r1 0x00000000
  386. long ret
  387. // RG8_UNORM: 16-bit texel, two u8 channels converted to floats in $r0, $r1 scaled by 1/255; $r2 = 0, $r3 = 1.0f
  388. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  389. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  390. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  391. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  392. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  393. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  394. cvt rn f32 $r1 u8 1 $r0 // $r1 = float(byte 1 of $r0)
  395. mov b32 $r2 0x00000000 // third component = 0
  396. cvt rn f32 $r0 u8 0 $r0 // done second because it overwrites the source
  397. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  398. mul f32 $r1 $r1 0x3b808081 // 0x3b808081 = 1/255 as f32
  399. mul f32 $r0 $r0 0x3b808081
  400. long ret
  401. // RG8_SNORM: 16-bit texel, two s8 channels converted to floats in $r0, $r1 scaled by 1/127; $r2 = 0, $r3 = 1.0f
  402. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  403. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  404. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  405. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  406. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  407. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  408. cvt rn f32 $r1 s8 1 $r0 // $r1 = float(signed byte 1 of $r0)
  409. long mov b32 $r2 0x00000000 // third component = 0
  410. cvt rn f32 $r0 s8 0 $r0 // done second because it overwrites the source
  411. mul f32 $r1 $r1 0x3c010204 // 0x3c010204 = 1/127 as f32
  412. mul f32 $r0 $r0 0x3c010204
  413. long ret
  414. // RG8_UINT: 16-bit texel, two u8 channels converted to u32 in $r0, $r1; $r2 = 0, $r3 = 1
  415. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  416. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  417. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  418. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  419. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  420. long mov b32 $r3 0x00000001 // fourth component = integer 1
  421. cvt u32 $r1 u8 1 $r0 // $r1 = byte 1 of $r0
  422. long mov b32 $r2 0x00000000 // third component = 0
  423. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  424. cvt u32 $r0 u8 0 $r0 // done last because it overwrites the source
  425. long ret
  426. // RG8_SINT: 16-bit texel, two s8 channels converted to s32 in $r0, $r1; $r2 = 0, $r3 = 1
  427. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  428. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  429. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  430. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  431. long mov b32 $r3 0x00000001 // fourth component = integer 1
  432. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  433. cvt s32 $r1 s8 1 $r0 // $r1 = signed byte 1 of $r0
  434. long mov b32 $r2 0x00000000 // third component = 0
  435. cvt s32 $r0 s8 0 $r0 // done last because it overwrites the source
  436. long ret
  437. // R16_UNORM: 16-bit texel converted to a float in $r0 scaled by 1/65535; $r1 = $r2 = 0, $r3 = 1.0f
  438. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  439. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  440. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  441. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  442. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  443. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  444. cvt rn f32 $r0 u16 0 $r0
  445. long mov b32 $r2 0x00000000
  446. long mov b32 $r1 0x00000000
  447. mul f32 $r0 $r0 0x37800080 // 0x37800080 = 1/65535 rounded to nearest f32, so 65535 maps to exactly 1.0
  448. long ret
  449. // R16_SNORM: 16-bit texel converted as s16 to a float in $r0 scaled by 1/32767; $r1 = $r2 = 0, $r3 = 1.0f
  450. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  451. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  452. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  453. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  454. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  455. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  456. cvt rn f32 $r0 s16 0 $r0 // the low half-word is reinterpreted as signed here
  457. long mov b32 $r2 0x00000000
  458. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  459. long mov b32 $r1 0x00000000
  460. mul f32 $r0 $r0 0x38000100 // 0x38000100 = 1/32767 rounded to nearest f32, so 32767 maps to exactly 1.0
  461. long ret
  462. // R16_SINT: s16 load into $r0 (no separate conversion step); $r1 = $r2 = 0, $r3 = 1
  463. $p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  464. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  465. $p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
  466. $p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
  467. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  468. long mov b32 $r3 0x00000001 // fourth component = integer 1
  469. long mov b32 $r2 0x00000000
  470. long mov b32 $r1 0x00000000
  471. long ret
  472. // R16_UINT: u16 load into $r0 (no separate conversion step); $r1 = $r2 = 0, $r3 = 1
  473. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  474. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  475. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  476. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  477. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  478. long mov b32 $r3 0x00000001 // fourth component = integer 1
  479. long mov b32 $r2 0x00000000
  480. long mov b32 $r1 0x00000000
  481. long ret
  482. // R16_FLOAT: 16-bit texel converted from f16 to f32 in $r0; $r1 = $r2 = 0, $r3 = 1.0f
  483. $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  484. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  485. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  486. $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
  487. $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
  488. long mov b32 $r3 0x3f800000 // fourth component = 1.0f
  489. long mov b32 $r2 0x00000000
  490. cvt f32 $r0 f16 $r0 0 // $r0 = f32(half 0 of $r0)
  491. mov b32 $r1 0x00000000
  492. long ret
  493. // R8_UNORM: 8-bit texel converted to a float in $r0 scaled by 1/255; $r1 = $r2 = 0, $r3 = 1.0f
  494. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  495. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  496. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  497. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
  498. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
  499. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  500. cvt rn f32 $r0 u8 0 $r0
  501. mov b32 $r2 0x00000000
  502. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  503. mul f32 $r0 $r0 0x3b808081 // 0x3b808081 = 1/255 as f32
  504. mov b32 $r1 0x00000000
  505. long ret
  506. // R8_SNORM: 8-bit texel converted as s8 to a float in $r0 scaled by 1/127; $r1 = $r2 = 0, $r3 = 1.0f
  507. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  508. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  509. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
  510. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
  511. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  512. mov b32 $r3 0x3f800000 // fourth component = 1.0f
  513. cvt rn f32 $r0 s8 0 $r0 // the low byte is reinterpreted as signed here
  514. mov b32 $r2 0x00000000
  515. mul f32 $r0 $r0 0x3c010204 // 0x3c010204 = 1/127 as f32
  516. mov b32 $r1 0x00000000
  517. long ret
  518. // R8_SINT: s8 load into $r0 (no separate conversion step); $r1 = $r2 = 0, $r3 = 1
  519. $p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  520. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  521. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  522. $p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
  523. $p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
  524. long mov b32 $r3 0x00000001 // fourth component = integer 1
  525. long mov b32 $r2 0x00000000
  526. long mov b32 $r1 0x00000000
  527. long ret
  528. // R8_UINT: u8 load into $r0 (no separate conversion step); $r1 = $r2 = 0, $r3 = 1
  529. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  530. $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set
  531. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  532. $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
  533. $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
  534. long mov b32 $r3 0x00000001 // fourth component = integer 1
  535. long mov b32 $r2 0x00000000
  536. long mov b32 $r1 0x00000000
  537. sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  538. long ret
  539. // R11G11B10_FLOAT TODO: unpacking is not implemented; only $r3 is written and it ends up as 1.0f
  540. $p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0 // ca variant, runs when $p1 is set; the texel lands in $r3
  541. set $p1 0x1 $p1 xor not $p2 // re-derive $p1 so it now selects the cv variant
  542. $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
  543. $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
  544. long mov b32 $r3 0x3f800000 // overwrites the loaded texel with 1.0f; $r0..$r2 are left untouched
  545. long nop
  546. long ret // NOTE(review): every earlier sched is followed by exactly 7 instructions; by that pattern this ret sits where a sched would be expected - confirm alignment
  547.  
  548.  
  549. // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
  550. //
  551. // INPUT:   $r0d (x)
  552. // OUTPUT:  $r0d (rcp(x))
  553. // CLOBBER: $r2 - $r7
  554. // SIZE:    9 * 8 bytes
  555. //
  556. gk104_rcp_f64:
  557.    long nop // placeholder body: the recurrence described above is not implemented here
  558.    long ret // returns with $r0d unmodified; no register is written by this stub
  559.  
  560. // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
  561. //
  562. // INPUT:   $r0d (x)
  563. // OUTPUT:  $r0d (rsqrt(x))
  564. // CLOBBER: $r2 - $r7
  565. // SIZE:    14 * 8 bytes
  566. //
  567. gk104_rsq_f64:
  568.    long nop // placeholder body: the recurrence described above is not implemented here
  569.    long ret // returns with $r0d unmodified; no register is written by this stub
  570.  
  571. //
  572. // Trap handler.
  573. // Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
  574. // Low 32 bytes of l[] memory shouldn't be used if resumability is required.
  575. //
  576. // Trap info:
  577. // 0x000: mutex
  578. // 0x004: PC
  579. // 0x008: trapstat
  580. // 0x00c: warperr
  581. // 0x010: tidx
  582. // 0x014: tidy
  583. // 0x018: tidz
  584. // 0x01c: ctaidx
  585. // 0x020: ctaidy
  586. // 0x024: ctaidz
  587. // 0x030: $r0q
  588. // 0x130: $flags
  589. // 0x140: s[]
  590. //
  591. st b128 wb l[0x00] $r0q // save $r0..$r3 in l[] so they can be used as scratch below
  592. // check state of the warp and continue if it didn't cause the trap
  593. long mov b32 $r1 $trapstat
  594. long mov b32 $r3 $warperr
  595. mov $r2 $flags mask 0xffff // keep $flags in $r2: stored at +0x130 or restored at end_cont
  596. and b32 0 $c $r1 $r3 // result is discarded, only $c is set from trapstat & warperr
  597. e $c bra #end_cont
  598. // spill control flow stack to l[]
  599. long mov b32 $r3 16 // loop counter: 16 preret pushes
  600. spill_cfstack:
  601. preret #end_exit
  602. sub b32 $r3 $c $r3 0x1
  603. lg $c bra #spill_cfstack
  604. // retrieve pointer to trap info
  605. mov b32 $r0 c0[0x1900]
  606. mov b32 $r1 c0[0x1904]
  607. // we only let a single faulting thread store its state
  608. mov b32 $r3 0x1
  609. exch b32 $r3 g[$r0d] $r3 // swap 1 into the mutex word at +0x000, old value comes back in $r3
  610. joinat #end_exit
  611. set $p0 0x1 eq u32 $r3 0x1 // $p0 = mutex was already taken
  612. join $p0 nop
  613. // store $c and $p registers
  614. st b32 wb g[$r0d+0x130] $r2
  615. // store $trapstat and $warperr
  616. long mov b32 $r2 $trapstat
  617. long mov b32 $r3 $warperr
  618. st b64 wb g[$r0d+0x8] $r2d
  619. // store registers
  620. st b128 wb g[$r0d+0x40] $r4q
  621. st b128 wb g[$r0d+0x50] $r8q
  622. st b128 wb g[$r0d+0x60] $r12q
  623. st b128 wb g[$r0d+0x70] $r16q
  624. st b128 wb g[$r0d+0x80] $r20q
  625. st b128 wb g[$r0d+0x90] $r24q
  626. st b128 wb g[$r0d+0xa0] $r28q
  627. st b128 wb g[$r0d+0xb0] $r32q
  628. st b128 wb g[$r0d+0xc0] $r36q
  629. st b128 wb g[$r0d+0xd0] $r40q
  630. st b128 wb g[$r0d+0xe0] $r44q
  631. st b128 wb g[$r0d+0xf0] $r48q
  632. st b128 wb g[$r0d+0x100] $r52q
  633. st b128 wb g[$r0d+0x110] $r56q
  634. st b128 wb g[$r0d+0x120] $r60q
  635. ld b64 $r2d cs l[0x0] // $r0..$r3 were saved to l[] on entry; copy them out in two 64-bit halves
  636. st b64 wb g[$r0d+0x30] $r2d
  637. ld b64 $r2d cs l[0x8]
  638. st b64 wb g[$r0d+0x38] $r2d
  639. // store thread id
  640. long mov b32 $r2 $tidx
  641. long mov b32 $r3 $tidy
  642. st b64 wb g[$r0d+0x10] $r2d
  643. long mov b32 $r2 $tidz
  644. long mov b32 $r3 $ctaidx
  645. st b64 wb g[$r0d+0x18] $r2d
  646. long mov b32 $r2 $ctaidy
  647. long mov b32 $r3 $ctaidz
  648. st b64 wb g[$r0d+0x20] $r2d
  649. // store shared memory (in reverse order so $r0d is base again at the end)
  650. long mov b32 $r3 $smemsz
  651. sub b32 $r3 $c $r3 0x4 // $r3 = offset of the last 32-bit word
  652. s $c bra #shared_done
  653. add b32 $r0 $c $r0 $r3 // 64-bit add of the offset to $r0d (low word here, carry into $r1 next)
  654. add b32 $r1 $r1 0x0 $c
  655. shared_loop:
  656. long ld b32 $r2 s[$r3]
  657. long st b32 wb g[$r0d+0x140] $r2
  658. sub b32 $r0 $c $r0 0x4 // 64-bit decrement of the destination pointer
  659. sub b32 $r1 $r1 0x0 $c
  660. sub b32 $r3 $c $r3 0x4
  661. lg $c bra #shared_loop
  662. shared_done:
  663. // search the stack for trap entry to retrieve PC
  664. mov b32 $r0 c0[0x1908]
  665. mov b32 $r1 c0[0x190c]
  666. membar sys
  667. // invalidate caches so we can read stack entries via g[]
  668. cctl ivall 0 l[0]
  669. cctl ivall 0 g[$r0d]
  670. // get offsets
  671. mov b32 $r2 $physid
  672. ext u32 $r3 $r2 0x0814 // MP id
  673. ext u32 $r2 $r2 0x0608 // warp id
  674. mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
  675. mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
  676. add b32 $r2 $r2 $r3 // MP + warp offset
  677. add b32 $r0 $c $r0 $r2
  678. add b32 $r1 $r1 0x0 $c
  679. mov b32 $r3 c0[0x1918] // cstack size; loaded once before the loop so the countdown below bounds the scan
  680. search_cstack:
  681. ld u8 $r2 cv g[$r0d+0x8]
  682. set $p0 0x1 eq u32 $r2 0xa // byte at +0x8 equal to 0xa marks the entry being searched for
  683. $p0 bra #entry_found
  684. add b32 $r0 $c $r0 0x10 // advance $r0d by one 0x10-byte entry
  685. add b32 $r1 $r1 0x0 $c
  686. sub b32 $r3 $c $r3 0x10 // remaining bytes of cstack to scan
  687. lg $c bra #search_cstack
  688. bra #end_exit
  689. entry_found:
  690. // load PC (may be unaligned and spread out)
  691. ld b32 $r2 cv g[$r0d]
  692. mov b32 $r0 c0[0x1900]
  693. mov b32 $r1 c0[0x1904]
  694. st b32 wb g[$r0d+0x4] $r2
  695. join nop
  696. // invalidate caches and exit
  697. end_exit:
  698. cctl ivall 0 g[0]
  699. bpt pause 0x0
  700. rtt terminate
  701. end_cont:
  702. bpt pause 0x0
  703. mov $flags $r2 mask 0xffff // restore $flags saved in $r2 on entry
  704. ld b128 $r0q cs l[0x00] // restore $r0..$r3 saved on entry
  705. rtt
  706.  
  707. .section #gk104_builtin_offsets
  708. .b64 #gk104_div_u32 // one 64-bit entry per labelled builtin, in this order
  709. .b64 #gk104_div_s32
  710. .b64 #gk104_rcp_f64
  711. .b64 #gk104_rsq_f64
  712.