.section #gk104_builtin_code

// DIV U32, based on the UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
// NOTE(review): $r2 is read by the xor below before any visible write,
// and there is no visible entry label or return -- some lines appear to
// have been lost in extraction; verify against the original file.
sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
// normalize the divisor: $r2 becomes the left-shift amount
long xor b32 $r2 $r2 0x1f
shl b32 $r2 $r3 clamp $r2
// $r1 = -b (two's complement), used to compute (-b * z) >> 32 below
long cvt u32 $r1 neg u32 $r1
long mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
// repeated refinement: z += z * ((-b * z) >> 32)
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
add $r2 (mul high u32 $r2 u32 $r3) $r2
// candidate quotient q' = (a * z) >> 32 (may be slightly low, fixed below)
mul high $r0 u32 $r0 u32 $r2
long cvt u32 $r2 neg u32 $r1
long add $r1 (mul u32 $r1 u32 $r0) $r3
set $p0 0x1 ge u32 $r1 $r2
// up to two correction steps: while (mod >= b) { mod -= b; q += 1; }
$p0 sub b32 $r1 $r1 $r2
sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
$p0 add b32 $r0 $r0 0x1
$p0 set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
// $p2: dividend was negative (the remainder is negated at the end)
// $p3: exactly one input was negative (the quotient is negated)
set $p2 0x1 lt s32 $r0 0x0
set $p3 0x1 lt s32 $r1 0x0 xor $p2
sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
long cvt s32 $r0 abs s32 $r0
long cvt s32 $r1 abs s32 $r1
// from here on this is the same UNR recurrence as DIV U32
long xor b32 $r2 $r2 0x1f
shl b32 $r2 $r3 clamp $r2
cvt u32 $r1 neg u32 $r1
sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul high $r0 u32 $r0 u32 $r2
long cvt u32 $r2 neg u32 $r1
long add $r1 (mul u32 $r1 u32 $r0) $r3
sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
// up to two correction steps, as in DIV U32
set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
$p0 set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
long $p0 add b32 $r0 $r0 0x1
// re-apply the signs recorded in $p2/$p3 above
long $p3 cvt s32 $r0 neg s32 $r0
sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
$p2 cvt s32 $r1 neg s32 $r1
// SULDP [for each format]
// $r2: surface info (format)
// $p0: access predicate
// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
// g[$r4d]: source address
//
// Each variant issues the same load three times, once per cache mode
// (ca/cg/cv); the $p1/$p2 predication selects exactly one of them.

// 128-bit raw load, components returned as-is in $r0q
$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 16-bit unorm components: u16 halves -> f32, scaled by 0x37800074 (~1/65535)
$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
cvt rn f32 $r3 u16 1 $r1
cvt rn f32 $r2 u16 0 $r1
mul f32 $r3 $r3 0x37800074
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt rn f32 $r1 u16 1 $r0
mul f32 $r2 $r2 0x37800074
cvt rn f32 $r0 u16 0 $r0
mul f32 $r1 $r1 0x37800074
mul f32 $r0 $r0 0x37800074

// 16-bit snorm components: s16 halves -> f32, scaled by 0x38000187 (~1/32767)
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
cvt rn f32 $r3 s16 1 $r1
cvt rn f32 $r2 s16 0 $r1
mul f32 $r3 $r3 0x38000187
cvt rn f32 $r1 s16 1 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r2 $r2 0x38000187
cvt rn f32 $r0 s16 0 $r0
mul f32 $r1 $r1 0x38000187
mul f32 $r0 $r0 0x38000187
// 16-bit signed integer components: s16 halves sign-extended to s32
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
cvt s32 $r3 s16 1 $r1
cvt s32 $r2 s16 0 $r1
cvt s32 $r1 s16 1 $r0
cvt s32 $r0 s16 0 $r0

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 16-bit unsigned integer components: u16 halves zero-extended to u32
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
cvt u32 $r3 u16 1 $r1
cvt u32 $r2 u16 0 $r1
cvt u32 $r1 u16 1 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt u32 $r0 u16 0 $r0

// 16-bit float components: f16 halves expanded to f32
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
cvt f32 $r3 f16 $r1 1
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt f32 $r2 f16 $r1 0
cvt f32 $r1 f16 $r0 1
cvt f32 $r0 f16 $r0 0

// 64-bit raw load; missing components defaulted to (0, 1.0f)
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r2 0x00000000
long mov b32 $r3 0x3f800000

// 64-bit raw load (integer variant); defaults (0, 1)
$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r2 0x00000000
long mov b32 $r3 0x00000001

// packed 10/10/10 unorm: extract 10-bit fields, scale by 0x3a802007 (~1/1023)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
ext u32 $r1 $r0 0x0a0a
long mov b32 $r3 0x3f800000
ext u32 $r2 $r0 0x0a14
long and b32 $r0 $r0 0x3ff
cvt rn f32 $r2 u16 0 $r2
cvt rn f32 $r1 u16 0 $r1
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r2 $r2 0x3a802007
cvt rn f32 $r0 u16 0 $r0
mul f32 $r1 $r1 0x3a802007
mul f32 $r0 $r0 0x3a802007
// packed 10/10/10 uint: extract 10-bit fields, keep as integers, alpha = 1
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
ext u32 $r1 $r0 0x0a0a
long mov b32 $r3 0x00000001
ext u32 $r2 $r0 0x0a14
long and b32 $r0 $r0 0x3ff

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 8-bit unorm components: u8 bytes -> f32, scaled by 0x3b808081 (~1/255)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
cvt rn f32 $r3 u8 3 $r0
cvt rn f32 $r2 u8 2 $r0
mul f32 $r3 $r3 0x3b808081
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt rn f32 $r1 u8 1 $r0
mul f32 $r2 $r2 0x3b808081
cvt rn f32 $r0 u8 0 $r0
mul f32 $r1 $r1 0x3b808081
mul f32 $r0 $r0 0x3b808081

// 8-bit snorm components: s8 bytes -> f32, scaled by 0x3c010204 (~1/127)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
cvt rn f32 $r3 s8 3 $r0
cvt rn f32 $r2 s8 2 $r0
mul f32 $r3 $r3 0x3c010204
cvt rn f32 $r1 s8 1 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r2 $r2 0x3c010204
cvt rn f32 $r0 s8 0 $r0
mul f32 $r1 $r1 0x3c010204
mul f32 $r0 $r0 0x3c010204

// 32-bit raw load, no conversion
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 32-bit raw load, no conversion (second variant)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// packed 5/6/5 unorm: 5-bit fields scaled by 0x3d042108 (~1/31),
// 6-bit field by 0x3c820821 (~1/63); alpha = 1.0f
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
ext u32 $r1 $r0 0x0605
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
long mov b32 $r3 0x3f800000
ext u32 $r2 $r0 0x050b
long and b32 $r0 $r0 0x1f
cvt rn f32 $r2 u8 0 $r2
cvt rn f32 $r1 u8 0 $r1
mul f32 $r2 $r2 0x3d042108
cvt rn f32 $r0 u8 0 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r1 $r1 0x3c820821
mul f32 $r0 $r0 0x3d042108

// packed 5/5/5 unorm: three 5-bit fields scaled by 0x3d042108 (~1/31)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
ext u32 $r1 $r0 0x0505
ext u32 $r2 $r0 0x050a
long and b32 $r0 $r0 0x1f
long mov b32 $r3 0x3f800000
cvt rn f32 $r2 u8 0 $r2
cvt rn f32 $r1 u8 0 $r1
cvt rn f32 $r0 u8 0 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r2 $r2 0x3d042108
mul f32 $r1 $r1 0x3d042108
mul f32 $r0 $r0 0x3d042108

// 2x16-bit unorm: u16 halves -> f32 scaled by ~1/65535, (B, A) = (0, 1.0f)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
cvt rn f32 $r1 u16 1 $r0
cvt rn f32 $r0 u16 0 $r0
mul f32 $r1 $r1 0x37800074
mul f32 $r0 $r0 0x37800074
long mov b32 $r2 0x00000000
long mov b32 $r3 0x3f800000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 2x16-bit snorm: s16 halves -> f32 scaled by ~1/32767, (B, A) = (0, 1.0f)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x3f800000
cvt rn f32 $r1 s16 1 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mov b32 $r2 0x00000000
cvt rn f32 $r0 s16 0 $r0
mul f32 $r1 $r1 0x38000187
mul f32 $r0 $r0 0x38000187

// 2x16-bit sint: s16 halves sign-extended, (B, A) = (0, 1)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x00000001
cvt s32 $r1 s16 1 $r0
mov b32 $r2 0x00000000
cvt s32 $r0 s16 0 $r0

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 2x16-bit uint: u16 halves zero-extended, (B, A) = (0, 1)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x00000001
cvt u32 $r1 u16 1 $r0
mov b32 $r2 0x00000000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt u32 $r0 u16 0 $r0

// 2x16-bit float: f16 halves expanded to f32, (B, A) = (0, 1.0f)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x3f800000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
cvt f32 $r1 f16 $r0 1
mov b32 $r2 0x00000000
cvt f32 $r0 f16 $r0 0

// single 32-bit raw load; (G, B, A) = (0, 0, 1.0f)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000

// single 32-bit raw load (integer variant); (G, B, A) = (0, 0, 1)
$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000

// 2x8-bit unorm: u8 bytes -> f32 scaled by ~1/255, (B, A) = (0, 1.0f)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x3f800000
cvt rn f32 $r1 u8 1 $r0
mov b32 $r2 0x00000000
cvt rn f32 $r0 u8 0 $r0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r1 $r1 0x3b808081
mul f32 $r0 $r0 0x3b808081

// 2x8-bit snorm: s8 bytes -> f32 scaled by ~1/127, (B, A) = (0, 1.0f)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
long mov b32 $r3 0x3f800000
cvt rn f32 $r1 s8 1 $r0
long mov b32 $r2 0x00000000
cvt rn f32 $r0 s8 0 $r0
mul f32 $r1 $r1 0x3c010204
mul f32 $r0 $r0 0x3c010204
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 2x8-bit integer load (raw u16, components split by the consumer);
// (B, A) = (0, 1)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 2x8-bit integer load (second variant)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
long mov b32 $r2 0x00000000

// single 16-bit unorm: u16 -> f32 scaled by ~1/65535; (G, B, A) = (0, 0, 1.0f)
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
cvt rn f32 $r0 u16 0 $r0
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000
mul f32 $r0 $r0 0x37800074

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// single 16-bit snorm: s16 -> f32 scaled by ~1/32767
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x3f800000
cvt rn f32 $r0 s16 0 $r0
long mov b32 $r2 0x00000000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
long mov b32 $r1 0x00000000
mul f32 $r0 $r0 0x38000187

// single 16-bit sint: sign-extending s16 load
$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000

// single 16-bit uint: zero-extending u16 load
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000

// single 16-bit float: f16 expanded to f32
$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
long mov b32 $r2 0x00000000
cvt f32 $r0 f16 $r0 0
mov b32 $r1 0x00000000

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// single 8-bit unorm: u8 -> f32 scaled by ~1/255
$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
mov b32 $r3 0x3f800000
cvt rn f32 $r0 u8 0 $r0
mov b32 $r2 0x00000000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mul f32 $r0 $r0 0x3b808081
mov b32 $r1 0x00000000

// single 8-bit snorm: s8 -> f32 scaled by ~1/127
$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
mov b32 $r3 0x3f800000
cvt rn f32 $r0 s8 0 $r0
mov b32 $r2 0x00000000
mul f32 $r0 $r0 0x3c010204
mov b32 $r1 0x00000000

// single 8-bit sint: sign-extending s8 load
$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
set $p1 0x1 $p1 xor not $p2
$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000

sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// single 8-bit uint: zero-extending u8 load
$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x00000001
long mov b32 $r2 0x00000000
long mov b32 $r1 0x00000000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00

// R11G11B10_FLOAT TODO
// NOTE(review): placeholder -- the raw value is loaded into $r3 and then
// immediately overwritten with 1.0f; no unpacking is performed yet.
$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0
set $p1 0x1 $p1 xor not $p2
$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
//
// NOTE(review): some lines appear to have been lost in extraction:
// $r4/$r6d are read below without a visible producer, the labels
// #rcp_denorm_or_zero and #rcp_result_denorm are referenced but never
// defined here, and $r8d is written despite the clobber list above
// saying $r2 - $r7. Verify against the original before assembling.
//
// Step 1: classify input according to exponent and value, and calculate
// result for 0/inf/nan. $r2 holds the exponent value, which starts at
// bit 52 (bit 20 of the upper half) and is 11 bits in length
ext u32 $r2 $r1 0xb14
add b32 $r3 $r2 0xffffffff
// We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
// denorm, or 0). Do this by subtracting 1 from the exponent, which will
// mean that it's > 0x7fd in those cases when doing unsigned comparison
set $p0 0x1 gt u32 $r3 0x7fd
// $r3: 0 for norms, 0x36 for denorms, -1 for others
sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
// Process all special values: NaN, inf, denorm, 0
mov b32 $r3 0xffffffff
// A number is NaN if its abs value is greater than or unordered with inf
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
(not $p0) bra #rcp_inf_or_denorm_or_zero
// NaN -> NaN, the next line sets the "quiet" bit of the result. This
// behavior is both seen on the CPU and the blob
join or b32 $r1 $r1 0x80000
rcp_inf_or_denorm_or_zero:
and b32 $r4 $r1 0x7ff00000
// Other values with nonzero in exponent field should be inf
set $p0 0x1 eq s32 $r4 0x0
sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
$p0 bra #rcp_denorm_or_zero
// inf: flipping the 0x7ff exponent field yields a (signed) zero
xor b32 $r1 $r1 0x7ff00000
set $p0 0x1 gtu f64 abs $r0d 0x0
join or b32 $r1 $r1 0x7ff00000
// non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
mul rn f64 $r0d $r0d 0x4350000000000000
sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
join mov b32 $r3 0x36
// All numbers with -1 in $r3 have their result ready in $r0d, return them
// others need further calculation
set $p0 0x1 lt s32 $r3 0x0
// Step 2: Before the real calculation goes on, renormalize the values to
// range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
// result in $r6d. The exponent will be recovered later.
ext u32 $r2 $r1 0xb14
and b32 $r7 $r1 0x800fffff
add b32 $r7 $r7 0x3ff00000
sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
// Step 3: Convert new value to float (no overflow will occur due to step
// 2), calculate rcp and do newton-raphson step once
cvt rz f32 $r5 f64 $r6d
mov b32 $r0 0xbf800000
fma rn f32 $r5 $r4 $r5 $r0
fma rn f32 $r0 neg $r4 $r5 $r4
// Step 4: convert result $r0 back to double, do newton-raphson steps
cvt f64 $r6d neg f64 $r6d
sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
cvt f64 $r8d f32 0x3f800000
// 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
// The formula used here (and above) is:
// RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
// The following code uses 2 FMAs for each step, and it will basically
// look like:
// tmp = -src * RCP_{n} + 1
// RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
fma rn f64 $r4d $r6d $r0d $r8d
fma rn f64 $r0d $r0d $r4d $r0d
fma rn f64 $r4d $r6d $r0d $r8d
fma rn f64 $r0d $r0d $r4d $r0d
fma rn f64 $r4d $r6d $r0d $r8d
fma rn f64 $r0d $r0d $r4d $r0d
sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
fma rn f64 $r4d $r6d $r0d $r8d
fma rn f64 $r0d $r0d $r4d $r0d
// Step 5: Exponent recovery and final processing
// The exponent is recovered by adding what we added to the exponent.
// Suppose we want to calculate rcp(x), but we have rcp(cx), then
// rcp(x) = c * rcp(cx)
// The delta in exponent comes from two sources:
// 1) The renormalization in step 2. The delta is:
//    0x3ff - (the original exponent saved in $r2) -- see the subr below
// 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
//    in $r3
// These 2 sources are calculated in the first two lines below, and then
// added to the exponent extracted from the result above.
// Note that after processing, the new exponent may >= 0x7ff (inf)
// or <= 0 (denorm). Those cases will be handled respectively below
subr b32 $r2 $r2 0x3ff
long add b32 $r4 $r2 $r3
ext u32 $r3 $r1 0xb14
// New exponent in $r3
long add b32 $r3 $r3 $r4
add b32 $r2 $r3 0xffffffff
sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
// (exponent-1) < 0x7fe (unsigned) means the result is in norm range
// (same logic as in step 1)
set $p0 0x1 lt u32 $r2 0x7fe
(not $p0) bra #rcp_result_inf_or_denorm
// Norms: convert exponents back and return
shl b32 $r4 $r4 clamp 0x14
long add b32 $r1 $r4 $r1
rcp_result_inf_or_denorm:
// New exponent >= 0x7ff means that result is inf
set $p0 0x1 ge s32 $r3 0x7ff
(not $p0) bra #rcp_result_denorm
sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
// inf result: keep the sign bit and force the exponent field to 0x7ff
and b32 $r1 $r1 0x80000000
add b32 $r1 $r1 0x7ff00000
// Denorm result comes from huge input. The greatest possible fp64, i.e.
// 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
// normal value. Other rcp result should be greater than that. If we
// set the exponent field to 1, we can recover the result by multiplying
// it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
// 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
// the logic here.
set $p0 0x1 ne u32 $r3 0x0
and b32 $r1 $r1 0x800fffff
$p0 cvt f64 $r6d f32 0x3e800000
sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
(not $p0) cvt f64 $r6d f32 0x3f000000
add b32 $r1 $r1 0x00100000
mul rn f64 $r0d $r0d $r6d
// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
// NOTE(review): some lines appear to have been lost in extraction:
// $r4d/$r5 are consumed below without a visible producer (presumably the
// initial rsqrt64h approximation), $r8d is written despite the clobber
// list above, and no branch separates the special-value path from the
// iteration. Verify against the original before assembling.
//
// Before getting initial result rsqrt64h, two special cases should be
// handled first.
// 1. NaN: set the highest bit in mantissa so it'll be surely recognized
//    as NaN in rsqrt64h
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
$p0 or b32 $r1 $r1 0x00080000
and b32 $r2 $r1 0x7fffffff
sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
// 2. denorms and small normal values: using their original value will
//    lose precision either at rsqrt64h or the first step in newton-raphson
//    steps below. Take 2 as a threshold in exponent field, and multiply
//    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
//    to recover in the end)
ext u32 $r3 $r1 0xb14
set $p1 0x1 le u32 $r3 0x2
long or b32 $r2 $r0 $r2
$p1 mul rn f64 $r0d $r0d 0x4350000000000000
// rsqrt64h will give correct result for 0/inf/nan, the following logic
// checks whether the input is one of those (exponent is 0x7ff or all 0
// except for the sign bit)
set b32 $r6 ne u32 $r3 0x7ff
long and b32 $r2 $r2 $r6
sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
set $p0 0x1 ne u32 $r2 0x0
// For 0/inf/nan, make sure the sign bit agrees with input and return
and b32 $r1 $r1 0x80000000
long or b32 $r1 $r1 $r5
// For others, do 4 Newton-Raphson steps with the formula:
// RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
// In the code below, each step is written as:
// tmp1 = 0.5 * x * RSQ_{n}
// tmp2 = -RSQ_{n} * tmp1 + 0.5
// RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
// $r8d = 0.5
cvt f64 $r8d f32 0x3f000000
mul rn f64 $r2d $r0d $r8d
mul rn f64 $r0d $r2d $r4d
fma rn f64 $r6d neg $r4d $r0d $r8d
fma rn f64 $r4d $r4d $r6d $r4d
mul rn f64 $r0d $r2d $r4d
fma rn f64 $r6d neg $r4d $r0d $r8d
sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
fma rn f64 $r4d $r4d $r6d $r4d
mul rn f64 $r0d $r2d $r4d
fma rn f64 $r6d neg $r4d $r0d $r8d
fma rn f64 $r4d $r4d $r6d $r4d
mul rn f64 $r0d $r2d $r4d
fma rn f64 $r6d neg $r4d $r0d $r8d
fma rn f64 $r4d $r4d $r6d $r4d
sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
// Multiply 2^27 to result for small inputs to recover
$p1 mul rn f64 $r4d $r4d 0x41a0000000000000
// Trap handler: dump warp state to the buffer pointed at by c0[0x1900].
// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
//
// NOTE(review): the branch targets #spill_cfstack, #shared_loop,
// #shared_done and #search_cstack are referenced below but never defined
// in the visible text, and some predicated control flow around the $p0
// sets has no visible consumer -- lines appear to have been lost in
// extraction; verify against the original before assembling.
st b128 wb l[0x00] $r0q
// check state of the warp and continue if it didn't cause the trap
long mov b32 $r1 $trapstat
long mov b32 $r3 $warperr
mov $r2 $flags mask 0xffff
// spill control flow stack to l[]
sub b32 $r3 $c $r3 0x1
lg $c bra #spill_cfstack
// retrieve pointer to trap info
mov b32 $r0 c0[0x1900]
mov b32 $r1 c0[0x1904]
// we only let a single faulting thread store its state
exch b32 $r3 g[$r0d] $r3
set $p0 0x1 eq u32 $r3 0x1
// store $c and $p registers
st b32 wb g[$r0d+0x130] $r2
// store $trapstat and $warperr
long mov b32 $r2 $trapstat
long mov b32 $r3 $warperr
st b64 wb g[$r0d+0x8] $r2d
// store the GPR file ($r0q was already saved to l[0x00] above)
st b128 wb g[$r0d+0x40] $r4q
st b128 wb g[$r0d+0x50] $r8q
st b128 wb g[$r0d+0x60] $r12q
st b128 wb g[$r0d+0x70] $r16q
st b128 wb g[$r0d+0x80] $r20q
st b128 wb g[$r0d+0x90] $r24q
st b128 wb g[$r0d+0xa0] $r28q
st b128 wb g[$r0d+0xb0] $r32q
st b128 wb g[$r0d+0xc0] $r36q
st b128 wb g[$r0d+0xd0] $r40q
st b128 wb g[$r0d+0xe0] $r44q
st b128 wb g[$r0d+0xf0] $r48q
st b128 wb g[$r0d+0x100] $r52q
st b128 wb g[$r0d+0x110] $r56q
st b128 wb g[$r0d+0x120] $r60q
// copy the entry-time $r0-$r3 from l[] into the trap info
ld b64 $r2d cs l[0x0]
st b64 wb g[$r0d+0x30] $r2d
ld b64 $r2d cs l[0x8]
st b64 wb g[$r0d+0x38] $r2d
// store thread and CTA ids
long mov b32 $r2 $tidx
long mov b32 $r3 $tidy
st b64 wb g[$r0d+0x10] $r2d
long mov b32 $r2 $tidz
long mov b32 $r3 $ctaidx
st b64 wb g[$r0d+0x18] $r2d
long mov b32 $r2 $ctaidy
long mov b32 $r3 $ctaidz
st b64 wb g[$r0d+0x20] $r2d
// store shared memory (in reverse order so $r0d is base again at the end)
long mov b32 $r3 $smemsz
sub b32 $r3 $c $r3 0x4
s $c bra #shared_done
add b32 $r0 $c $r0 $r3
add b32 $r1 $r1 0x0 $c
long ld b32 $r2 s[$r3]
long st b32 wb g[$r0d+0x140] $r2
sub b32 $r0 $c $r0 0x4
sub b32 $r1 $r1 0x0 $c
sub b32 $r3 $c $r3 0x4
lg $c bra #shared_loop
// search the stack for trap entry to retrieve PC
mov b32 $r0 c0[0x1908]
mov b32 $r1 c0[0x190c]
// invalidate caches so we can read stack entries via g[]
ext u32 $r3 $r2 0x0814 // MP id
ext u32 $r2 $r2 0x0608 // warp id
mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
add b32 $r2 $r2 $r3 // MP + warp offset
add b32 $r0 $c $r0 $r2
add b32 $r1 $r1 0x0 $c
mov b32 $r3 c0[0x1918] // cstack size
ld u8 $r2 cv g[$r0d+0x8]
set $p0 0x1 eq u32 $r2 0xa
add b32 $r0 $c $r0 0x10
add b32 $r1 $r1 0x0 $c
sub b32 $r3 $c $r3 0x10
lg $c bra #search_cstack
// load PC (may be unaligned and spread out)
ld b32 $r2 cv g[$r0d]
mov b32 $r0 c0[0x1900]
mov b32 $r1 c0[0x1904]
st b32 wb g[$r0d+0x4] $r2
// invalidate caches and exit
mov $flags $r2 mask 0xffff
ld b128 $r0q cs l[0x00]
.section #gk104_builtin_offsets