Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright © 2010 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | */ |
||
23 | |||
24 | /** @file brw_fs_generator.cpp |
||
25 | * |
||
26 | * This file supports generating code from the FS LIR to the actual |
||
27 | * native instructions. |
||
28 | */ |
||
29 | |||
30 | #include "main/macros.h" |
||
31 | #include "brw_context.h" |
||
32 | #include "brw_eu.h" |
||
33 | #include "brw_fs.h" |
||
34 | #include "brw_cfg.h" |
||
35 | |||
36 | static uint32_t brw_file_from_reg(fs_reg *reg) |
||
37 | { |
||
38 | switch (reg->file) { |
||
39 | case GRF: |
||
40 | return BRW_GENERAL_REGISTER_FILE; |
||
41 | case MRF: |
||
42 | return BRW_MESSAGE_REGISTER_FILE; |
||
43 | case IMM: |
||
44 | return BRW_IMMEDIATE_VALUE; |
||
45 | default: |
||
46 | unreachable("not reached"); |
||
47 | } |
||
48 | } |
||
49 | |||
/* Convert an FS IR fs_reg into a fully-specified hardware brw_reg:
 * register file/number, region (vstride/width/hstride), type, subregister
 * byte offset, and the abs/negate source modifiers.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case MRF:
      if (reg->stride == 0) {
         /* Stride 0: scalar region, the same channel replicated. */
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else if (reg->width < 8) {
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, reg->width * reg->stride,
                          reg->width, reg->stride);
      } else {
         /* From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers with width > 8, we have to use a width of 8
          * and trust the compression state to sort out the exec size.
          */
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      break;
   case IMM:
      /* Immediates: pick the constructor matching the IR type. */
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_W:
         brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UW:
         brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_VF:
         brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
         break;
      default:
         unreachable("not reached");
      }
      break;
   case HW_REG:
      /* Fixed hardware register: already fully specified by the IR. */
      assert(reg->type == reg->fixed_hw_reg.type);
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   default:
      unreachable("not reached");
   }
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}
||
123 | |||
/* Construct a code generator for one shader stage.
 *
 * Most members are simply captured from the caller; the brw_codegen
 * instruction-emission state (p) is ralloc'ed out of mem_ctx so its
 * lifetime is tied to the memory context, not to this object.
 * debug_flag starts false; callers that want disassembly enable it
 * elsewhere (not visible in this file chunk).
 */
fs_generator::fs_generator(struct brw_context *brw,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           struct gl_program *prog,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           const char *stage_abbrev)

   : brw(brw), devinfo(brw->intelScreen->devinfo), key(key),
     prog_data(prog_data),
     prog(prog), promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}
||
142 | |||
fs_generator::~fs_generator()
{
   /* Intentionally empty: p (and everything it allocated) was ralloc'ed
    * from mem_ctx in the constructor, so it is released when the memory
    * context is freed, not here.
    */
}
||
146 | |||
147 | class ip_record : public exec_node { |
||
148 | public: |
||
149 | DECLARE_RALLOC_CXX_OPERATORS(ip_record) |
||
150 | |||
151 | ip_record(int ip) |
||
152 | { |
||
153 | this->ip = ip; |
||
154 | } |
||
155 | |||
156 | int ip; |
||
157 | }; |
||
158 | |||
/* Fix up the UIP fields of previously-emitted HALT instructions so they
 * all jump to a final HALT placed at the end of the program.
 *
 * Returns false (emitting nothing) on Gen < 6 or when no HALTs were
 * recorded; returns true after patching and clearing the record list.
 */
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   /* Jump distances are encoded in units that vary by generation. */
   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}
||
195 | |||
/* Emit a single framebuffer-write SEND.
 *
 * Chooses the render-target-write message control from the opcode,
 * dual-source-blend state and dispatch width, then emits the message
 * through brw_fb_WRITE.  nr is the message length to use (callers vary
 * it for the AA-data runtime check path).
 */
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      /* Pre-Gen6 the header's second register must be populated with g1
       * manually; done WE_all, unpredicated and uncompressed so every
       * channel of the header is written.
       */
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      /* Dual-source SIMD16 is split into two SIMD8 messages: subspans
       * 0-1 first, then (on the eot message) subspans 2-3.
       */
      if (dispatch_width == 8 || !inst->eot)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                dispatch_width,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
||
249 | |||
/* Emit the framebuffer write for an FB_WRITE instruction: set up the
 * (optional) message header, then fire one write — or, on pre-Gen6 with
 * runtime_check_aads_emit, a conditional pair of writes with and without
 * the antialias data register.
 */
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         /* The live-pixel mask register differs by generation. */
         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         /* Build the header explicitly: copy g0 into the payload, then
          * patch individual header dwords below as needed.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      /* Record the JMPI's index so the forward jump distance can be
       * patched once the skipped write has been emitted.
       */
      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}
||
350 | |||
/* Emit a SIMD8 URB write SEND: null destination, payload as src0, with
 * header present and the global offset taken from the instruction.
 */
void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   /* No response expected (rlen 0); EOT follows the instruction flag. */
   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}
||
371 | |||
/* Emit the compute-shader terminate SEND to the thread spawner. */
void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
||
402 | |||
/* Emit the framebuffer write used by BLORP blit/clear programs: always a
 * SIMD16 single-source write with EOT and last-render-target set.
 */
void
fs_generator::generate_blorp_fb_write(fs_inst *inst)
{
   brw_fb_WRITE(p,
                16 /* dispatch_width */,
                brw_message_reg(inst->base_mrf),
                brw_reg_from_fs_reg(&inst->src[0]),
                BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
                inst->target,
                inst->mlen,
                0,
                true,   /* eot */
                true,   /* last render target */
                inst->header_size != 0);
}
||
418 | |||
/* Emit linear interpolation: a single PLN when the hardware supports it
 * (and, pre-Gen7, the delta register pair is aligned), otherwise the
 * two-instruction LINE+MAC fallback.
 */
void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *           /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}
||
455 | |||
/* Emit a Gen6 math instruction.  SIMD16 is issued as two SIMD8 halves;
 * for unary ops the second half's src1 is the null register.
 */
void
fs_generator::generate_math_gen6(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);
   /* A real (non-null-arch-reg) src1 means this is a two-operand op. */
   bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;

   if (dispatch_width == 8) {
      gen6_math(p, dst, op, src0, src1);
   } else if (dispatch_width == 16) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1));
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen6_math(p, sechalf(dst), op, sechalf(src0),
                binop ? sechalf(src1) : brw_null_reg());
      brw_pop_insn_state(p);
   }
}
||
478 | |||
/* Emit a Gen4 math-box send.  SIMD16 is issued as two SIMD8 sends using
 * consecutive MRFs, restoring compressed mode afterwards.
 */
void
fs_generator::generate_math_gen4(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   if (dispatch_width == 8) {
      gen4_math(p, dst,
                op,
                inst->base_mrf, src,
                BRW_MATH_PRECISION_FULL);
   } else if (dispatch_width == 16) {
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen4_math(p, firsthalf(dst),
                op,
                inst->base_mrf, firsthalf(src),
                BRW_MATH_PRECISION_FULL);
      /* Second half uses the next MRF for its message payload. */
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen4_math(p, sechalf(dst),
                op,
                inst->base_mrf + 1, sechalf(src),
                BRW_MATH_PRECISION_FULL);

      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
||
509 | |||
510 | void |
||
511 | fs_generator::generate_math_g45(fs_inst *inst, |
||
512 | struct brw_reg dst, |
||
513 | struct brw_reg src) |
||
514 | { |
||
515 | if (inst->opcode == SHADER_OPCODE_POW || |
||
516 | inst->opcode == SHADER_OPCODE_INT_QUOTIENT || |
||
517 | inst->opcode == SHADER_OPCODE_INT_REMAINDER) { |
||
518 | generate_math_gen4(inst, dst, src); |
||
519 | return; |
||
520 | } |
||
521 | |||
522 | int op = brw_math_function(inst->opcode); |
||
523 | |||
524 | assert(inst->mlen >= 1); |
||
525 | |||
526 | gen4_math(p, dst, |
||
527 | op, |
||
528 | inst->base_mrf, src, |
||
529 | BRW_MATH_PRECISION_FULL); |
||
530 | } |
||
531 | |||
532 | void |
||
533 | fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, |
||
534 | struct brw_reg sampler_index) |
||
535 | { |
||
536 | int msg_type = -1; |
||
537 | int rlen = 4; |
||
538 | uint32_t simd_mode; |
||
539 | uint32_t return_format; |
||
540 | bool is_combined_send = inst->eot; |
||
541 | |||
542 | switch (dst.type) { |
||
543 | case BRW_REGISTER_TYPE_D: |
||
544 | return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; |
||
545 | break; |
||
546 | case BRW_REGISTER_TYPE_UD: |
||
547 | return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; |
||
548 | break; |
||
549 | default: |
||
550 | return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; |
||
551 | break; |
||
552 | } |
||
553 | |||
554 | switch (inst->exec_size) { |
||
555 | case 8: |
||
556 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; |
||
557 | break; |
||
558 | case 16: |
||
559 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
||
560 | break; |
||
561 | default: |
||
562 | unreachable("Invalid width for texture instruction"); |
||
563 | } |
||
564 | |||
565 | if (devinfo->gen >= 5) { |
||
566 | switch (inst->opcode) { |
||
567 | case SHADER_OPCODE_TEX: |
||
568 | if (inst->shadow_compare) { |
||
569 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; |
||
570 | } else { |
||
571 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; |
||
572 | } |
||
573 | break; |
||
574 | case FS_OPCODE_TXB: |
||
575 | if (inst->shadow_compare) { |
||
576 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; |
||
577 | } else { |
||
578 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; |
||
579 | } |
||
580 | break; |
||
581 | case SHADER_OPCODE_TXL: |
||
582 | if (inst->shadow_compare) { |
||
583 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; |
||
584 | } else { |
||
585 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; |
||
586 | } |
||
587 | break; |
||
588 | case SHADER_OPCODE_TXS: |
||
589 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; |
||
590 | break; |
||
591 | case SHADER_OPCODE_TXD: |
||
592 | if (inst->shadow_compare) { |
||
593 | /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */ |
||
594 | assert(devinfo->gen >= 8 || devinfo->is_haswell); |
||
595 | msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; |
||
596 | } else { |
||
597 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; |
||
598 | } |
||
599 | break; |
||
600 | case SHADER_OPCODE_TXF: |
||
601 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; |
||
602 | break; |
||
603 | case SHADER_OPCODE_TXF_CMS: |
||
604 | if (devinfo->gen >= 7) |
||
605 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; |
||
606 | else |
||
607 | msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; |
||
608 | break; |
||
609 | case SHADER_OPCODE_TXF_UMS: |
||
610 | assert(devinfo->gen >= 7); |
||
611 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; |
||
612 | break; |
||
613 | case SHADER_OPCODE_TXF_MCS: |
||
614 | assert(devinfo->gen >= 7); |
||
615 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; |
||
616 | break; |
||
617 | case SHADER_OPCODE_LOD: |
||
618 | msg_type = GEN5_SAMPLER_MESSAGE_LOD; |
||
619 | break; |
||
620 | case SHADER_OPCODE_TG4: |
||
621 | if (inst->shadow_compare) { |
||
622 | assert(devinfo->gen >= 7); |
||
623 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; |
||
624 | } else { |
||
625 | assert(devinfo->gen >= 6); |
||
626 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; |
||
627 | } |
||
628 | break; |
||
629 | case SHADER_OPCODE_TG4_OFFSET: |
||
630 | assert(devinfo->gen >= 7); |
||
631 | if (inst->shadow_compare) { |
||
632 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; |
||
633 | } else { |
||
634 | msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; |
||
635 | } |
||
636 | break; |
||
637 | default: |
||
638 | unreachable("not reached"); |
||
639 | } |
||
640 | } else { |
||
641 | switch (inst->opcode) { |
||
642 | case SHADER_OPCODE_TEX: |
||
643 | /* Note that G45 and older determines shadow compare and dispatch width |
||
644 | * from message length for most messages. |
||
645 | */ |
||
646 | if (dispatch_width == 8) { |
||
647 | msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; |
||
648 | if (inst->shadow_compare) { |
||
649 | assert(inst->mlen == 6); |
||
650 | } else { |
||
651 | assert(inst->mlen <= 4); |
||
652 | } |
||
653 | } else { |
||
654 | if (inst->shadow_compare) { |
||
655 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; |
||
656 | assert(inst->mlen == 9); |
||
657 | } else { |
||
658 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; |
||
659 | assert(inst->mlen <= 7 && inst->mlen % 2 == 1); |
||
660 | } |
||
661 | } |
||
662 | break; |
||
663 | case FS_OPCODE_TXB: |
||
664 | if (inst->shadow_compare) { |
||
665 | assert(dispatch_width == 8); |
||
666 | assert(inst->mlen == 6); |
||
667 | msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; |
||
668 | } else { |
||
669 | assert(inst->mlen == 9); |
||
670 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; |
||
671 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
||
672 | } |
||
673 | break; |
||
674 | case SHADER_OPCODE_TXL: |
||
675 | if (inst->shadow_compare) { |
||
676 | assert(dispatch_width == 8); |
||
677 | assert(inst->mlen == 6); |
||
678 | msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; |
||
679 | } else { |
||
680 | assert(inst->mlen == 9); |
||
681 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; |
||
682 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
||
683 | } |
||
684 | break; |
||
685 | case SHADER_OPCODE_TXD: |
||
686 | /* There is no sample_d_c message; comparisons are done manually */ |
||
687 | assert(dispatch_width == 8); |
||
688 | assert(inst->mlen == 7 || inst->mlen == 10); |
||
689 | msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; |
||
690 | break; |
||
691 | case SHADER_OPCODE_TXF: |
||
692 | assert(inst->mlen <= 9 && inst->mlen % 2 == 1); |
||
693 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; |
||
694 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
||
695 | break; |
||
696 | case SHADER_OPCODE_TXS: |
||
697 | assert(inst->mlen == 3); |
||
698 | msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; |
||
699 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
||
700 | break; |
||
701 | default: |
||
702 | unreachable("not reached"); |
||
703 | } |
||
704 | } |
||
705 | assert(msg_type != -1); |
||
706 | |||
707 | if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { |
||
708 | rlen = 8; |
||
709 | dst = vec16(dst); |
||
710 | } |
||
711 | |||
712 | if (is_combined_send) { |
||
713 | assert(devinfo->gen >= 9 || devinfo->is_cherryview); |
||
714 | rlen = 0; |
||
715 | } |
||
716 | |||
717 | assert(devinfo->gen < 7 || inst->header_size == 0 || |
||
718 | src.file == BRW_GENERAL_REGISTER_FILE); |
||
719 | |||
720 | assert(sampler_index.type == BRW_REGISTER_TYPE_UD); |
||
721 | |||
722 | /* Load the message header if present. If there's a texture offset, |
||
723 | * we need to set it up explicitly and load the offset bitfield. |
||
724 | * Otherwise, we can use an implied move from g0 to the first message reg. |
||
725 | */ |
||
726 | if (inst->header_size != 0) { |
||
727 | if (devinfo->gen < 6 && !inst->offset) { |
||
728 | /* Set up an implied move from g0 to the MRF. */ |
||
729 | src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); |
||
730 | } else { |
||
731 | struct brw_reg header_reg; |
||
732 | |||
733 | if (devinfo->gen >= 7) { |
||
734 | header_reg = src; |
||
735 | } else { |
||
736 | assert(inst->base_mrf != -1); |
||
737 | header_reg = brw_message_reg(inst->base_mrf); |
||
738 | } |
||
739 | |||
740 | brw_push_insn_state(p); |
||
741 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
742 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
743 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
744 | /* Explicitly set up the message header by copying g0 to the MRF. */ |
||
745 | brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); |
||
746 | |||
747 | if (inst->offset) { |
||
748 | /* Set the offset bits in DWord 2. */ |
||
749 | brw_MOV(p, get_element_ud(header_reg, 2), |
||
750 | brw_imm_ud(inst->offset)); |
||
751 | } |
||
752 | |||
753 | brw_adjust_sampler_state_pointer(p, header_reg, sampler_index); |
||
754 | brw_pop_insn_state(p); |
||
755 | } |
||
756 | } |
||
757 | |||
758 | uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || |
||
759 | inst->opcode == SHADER_OPCODE_TG4_OFFSET) |
||
760 | ? prog_data->binding_table.gather_texture_start |
||
761 | : prog_data->binding_table.texture_start; |
||
762 | |||
763 | if (sampler_index.file == BRW_IMMEDIATE_VALUE) { |
||
764 | uint32_t sampler = sampler_index.dw1.ud; |
||
765 | |||
766 | brw_SAMPLE(p, |
||
767 | retype(dst, BRW_REGISTER_TYPE_UW), |
||
768 | inst->base_mrf, |
||
769 | src, |
||
770 | sampler + base_binding_table_index, |
||
771 | sampler % 16, |
||
772 | msg_type, |
||
773 | rlen, |
||
774 | inst->mlen, |
||
775 | inst->header_size != 0, |
||
776 | simd_mode, |
||
777 | return_format); |
||
778 | |||
779 | brw_mark_surface_used(prog_data, sampler + base_binding_table_index); |
||
780 | } else { |
||
781 | /* Non-const sampler index */ |
||
782 | /* Note: this clobbers `dst` as a temporary before emitting the send */ |
||
783 | |||
784 | struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); |
||
785 | struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD)); |
||
786 | |||
787 | struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); |
||
788 | |||
789 | brw_push_insn_state(p); |
||
790 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
791 | brw_set_default_access_mode(p, BRW_ALIGN_1); |
||
792 | |||
793 | /* Some care required: `sampler` and `temp` may alias: |
||
794 | * addr = sampler & 0xff |
||
795 | * temp = (sampler << 8) & 0xf00 |
||
796 | * addr = addr | temp |
||
797 | */ |
||
798 | brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index)); |
||
799 | brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u)); |
||
800 | brw_AND(p, temp, temp, brw_imm_ud(0x0f00)); |
||
801 | brw_AND(p, addr, addr, brw_imm_ud(0x0ff)); |
||
802 | brw_OR(p, addr, addr, temp); |
||
803 | |||
804 | brw_pop_insn_state(p); |
||
805 | |||
806 | /* dst = send(offset, a0.0 | |
||
807 | brw_inst *insn = brw_send_indirect_message( |
||
808 | p, BRW_SFID_SAMPLER, dst, src, addr); |
||
809 | brw_set_sampler_message(p, insn, |
||
810 | |||
811 | |||
812 | msg_type, |
||
813 | rlen, |
||
814 | inst->mlen /* mlen */, |
||
815 | inst->header_size != 0 /* header */, |
||
816 | simd_mode, |
||
817 | return_format); |
||
818 | |||
819 | /* visitor knows more than we do about the surface limit required, |
||
820 | * so has already done marking. |
||
821 | */ |
||
822 | } |
||
823 | |||
824 | if (is_combined_send) { |
||
825 | brw_inst_set_eot(p->devinfo, brw_last_inst, true); |
||
826 | brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); |
||
827 | } |
||
828 | } |
||
829 | |||
830 | |||
831 | /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input |
||
832 | * looking like: |
||
833 | * |
||
834 | * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br |
||
835 | * |
||
836 | * Ideally, we want to produce: |
||
837 | * |
||
838 | * DDX DDY |
||
839 | * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) |
||
840 | * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) |
||
841 | * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) |
||
842 | * (ss0.br - ss0.bl) (ss0.tr - ss0.br) |
||
843 | * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) |
||
844 | * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) |
||
845 | * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) |
||
846 | * (ss1.br - ss1.bl) (ss1.tr - ss1.br) |
||
847 | * |
||
848 | * and add another set of two more subspans if in 16-pixel dispatch mode. |
||
849 | * |
||
850 | * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result |
||
851 | * for each pair, and vertstride = 2 jumps us 2 elements after processing a |
||
852 | * pair. But the ideal approximation may impose a huge performance cost on |
||
853 | * sample_d. On at least Haswell, sample_d instruction does some |
||
854 | * optimizations if the same LOD is used for all pixels in the subspan. |
||
855 | * |
||
856 | * For DDY, we need to use ALIGN16 mode since it's capable of doing the |
||
857 | * appropriate swizzling. |
||
858 | */ |
||
859 | void |
||
860 | fs_generator::generate_ddx(enum opcode opcode, |
||
861 | struct brw_reg dst, struct brw_reg src) |
||
862 | { |
||
863 | unsigned vstride, width; |
||
864 | |||
865 | if (opcode == FS_OPCODE_DDX_FINE) { |
||
866 | /* produce accurate derivatives */ |
||
867 | vstride = BRW_VERTICAL_STRIDE_2; |
||
868 | width = BRW_WIDTH_2; |
||
869 | } else { |
||
870 | /* replicate the derivative at the top-left pixel to other pixels */ |
||
871 | vstride = BRW_VERTICAL_STRIDE_4; |
||
872 | width = BRW_WIDTH_4; |
||
873 | } |
||
874 | |||
875 | struct brw_reg src0 = brw_reg(src.file, src.nr, 1, |
||
876 | src.negate, src.abs, |
||
877 | BRW_REGISTER_TYPE_F, |
||
878 | vstride, |
||
879 | width, |
||
880 | BRW_HORIZONTAL_STRIDE_0, |
||
881 | BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
||
882 | struct brw_reg src1 = brw_reg(src.file, src.nr, 0, |
||
883 | src.negate, src.abs, |
||
884 | BRW_REGISTER_TYPE_F, |
||
885 | vstride, |
||
886 | width, |
||
887 | BRW_HORIZONTAL_STRIDE_0, |
||
888 | BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
||
889 | brw_ADD(p, dst, src0, negate(src1)); |
||
890 | } |
||
891 | |||
892 | /* The negate_value boolean is used to negate the derivative computation for |
||
893 | * FBOs, since they place the origin at the upper left instead of the lower |
||
894 | * left. |
||
895 | */ |
||
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src,
                           bool negate_value)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
       * Region Restrictions):
       *
       *     In Align16 access mode, SIMD16 is not allowed for DW operations
       *     and SIMD8 is not allowed for DF operations.
       *
       * In this context, "DW operations" means "operations acting on 32-bit
       * values", so it includes operations on floats.
       *
       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
       * (Instruction Compression -> Rules and Restrictions):
       *
       *     A compressed instruction must be in Align1 access mode. Align16
       *     mode instructions cannot be compressed.
       *
       * Similar text exists in the g45 PRM.
       *
       * On these platforms, if we're building a SIMD16 shader, we need to
       * manually unroll to a pair of SIMD8 instructions.
       */
      bool unroll_to_simd8 =
         (dispatch_width == 16 &&
          (devinfo->gen == 4 || (devinfo->gen == 7 && !devinfo->is_haswell)));

      /* produce accurate derivatives:
       * src0's XYXY swizzle reads the top row of each subspan and src1's
       * ZWZW reads the bottom row, giving the per-pixel (top - bottom)
       * differences described in the comment above generate_ddx().
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      /* The swizzles above require Align16 access mode. */
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      if (unroll_to_simd8) {
         /* Emit as two uncompressed SIMD8 halves (see restriction above). */
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         if (negate_value) {
            brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0)));
         } else {
            brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1)));
         }
      } else {
         if (negate_value)
            brw_ADD(p, dst, src1, negate(src0));
         else
            brw_ADD(p, dst, src0, negate(src1));
      }
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels:
       * src0 reads subnr 0 (ss.tl) and src1 reads subnr 2 (ss.bl) of each
       * subspan, broadcast across it by the <4;4,0> region.
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      if (negate_value)
         brw_ADD(p, dst, src1, negate(src0));
      else
         brw_ADD(p, dst, src0, negate(src1));
   }
}
||
984 | |||
void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   /* Emit the HALT used to skip the rest of the shader for fully-discarded
    * execution; only available in this form on gen6+.
    */
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   /* Record the instruction's position so the patching pass can find it. */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}
||
1001 | |||
1002 | void |
||
1003 | fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) |
||
1004 | { |
||
1005 | assert(inst->mlen != 0); |
||
1006 | |||
1007 | brw_MOV(p, |
||
1008 | brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0), |
||
1009 | retype(src, BRW_REGISTER_TYPE_UD)); |
||
1010 | brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), |
||
1011 | inst->exec_size / 8, inst->offset); |
||
1012 | } |
||
1013 | |||
1014 | void |
||
1015 | fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) |
||
1016 | { |
||
1017 | assert(inst->mlen != 0); |
||
1018 | |||
1019 | brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), |
||
1020 | inst->exec_size / 8, inst->offset); |
||
1021 | } |
||
1022 | |||
void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   /* Gen7+ scratch fill.  Unlike the pre-gen7 path there is no MRF payload
    * to assemble here (note: no base_mrf/mlen use).
    */
   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}
||
1028 | |||
1029 | void |
||
1030 | fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, |
||
1031 | struct brw_reg dst, |
||
1032 | struct brw_reg index, |
||
1033 | struct brw_reg offset) |
||
1034 | { |
||
1035 | assert(inst->mlen != 0); |
||
1036 | |||
1037 | assert(index.file == BRW_IMMEDIATE_VALUE && |
||
1038 | index.type == BRW_REGISTER_TYPE_UD); |
||
1039 | uint32_t surf_index = index.dw1.ud; |
||
1040 | |||
1041 | assert(offset.file == BRW_IMMEDIATE_VALUE && |
||
1042 | offset.type == BRW_REGISTER_TYPE_UD); |
||
1043 | uint32_t read_offset = offset.dw1.ud; |
||
1044 | |||
1045 | brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), |
||
1046 | read_offset, surf_index); |
||
1047 | |||
1048 | brw_mark_surface_used(prog_data, surf_index); |
||
1049 | } |
||
1050 | |||
void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   /* Gen7+ uniform pull-constant load, emitted as a SIMD4x2 sampler LD
    * message.  The surface index may be an immediate or a dynamically
    * computed register (bindless/indexed UBOs); the latter goes through an
    * indirect send with the descriptor built in a0.0.
    */
   assert(inst->mlen == 0);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;
   int mlen = 1;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
      mlen = 2;
      header_present = true;

      /* Build the header: copy g0, then set the SIMD4x2 extension bit in
       * dword 2 (GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2).
       */
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: plain SEND with the index baked into the
       * message descriptor.
       */
      uint32_t surf_index = index.dw1.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, dst);
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: mask it into the address register and use an
       * indirect send so the descriptor is ORed with a0.0 at execution time.
       */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
||
1151 | |||
void
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   /* Pre-gen7 varying-offset pull-constant load, emitted as a sampler LD
    * message with an explicit header and the per-channel offsets placed in
    * the MRF slot following it.
    */
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   /* Response length and SIMD mode follow the dispatch width. */
   uint32_t simd_mode, rlen, msg_type;
   if (dispatch_width == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Copy the offsets into the MRF after the message header. */
   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
                                      BRW_REGISTER_TYPE_D);
   brw_MOV(p, offset_mrf, offset);

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_qtr_control(p->devinfo, send, BRW_COMPRESSION_NONE);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);

   brw_mark_surface_used(prog_data, surf_index);
}
||
1218 | |||
void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   /* Gen7+ varying-offset pull-constant load via a headerless sampler LD
    * message.  Handles both immediate and dynamically computed surface
    * indices.
    */
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   /* Message/response lengths and SIMD mode follow the dispatch width. */
   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: plain SEND. */
      uint32_t surf_index = index.dw1.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: build the descriptor's surface bits in a0.0
       * and use an indirect send.
       */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      /* NOTE(review): the first two arguments below were garbled in the
       * archived copy; restored as 0 /* surface * / and 0 /* sampler * / to
       * match the descriptor layout used by the other indirect sends in
       * this file — verify against upstream.
       */
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
||
1299 | |||
1300 | /** |
||
1301 | * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred |
||
1302 | * into the flags register (f0.0). |
||
1303 | * |
||
1304 | * Used only on Gen6 and above. |
||
1305 | */ |
||
1306 | void |
||
1307 | fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst) |
||
1308 | { |
||
1309 | struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg); |
||
1310 | struct brw_reg dispatch_mask; |
||
1311 | |||
1312 | if (devinfo->gen >= 6) |
||
1313 | dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); |
||
1314 | else |
||
1315 | dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); |
||
1316 | |||
1317 | brw_push_insn_state(p); |
||
1318 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
1319 | brw_MOV(p, flags, dispatch_mask); |
||
1320 | brw_pop_insn_state(p); |
||
1321 | } |
||
1322 | |||
1323 | void |
||
1324 | fs_generator::generate_pixel_interpolator_query(fs_inst *inst, |
||
1325 | struct brw_reg dst, |
||
1326 | struct brw_reg src, |
||
1327 | struct brw_reg msg_data, |
||
1328 | unsigned msg_type) |
||
1329 | { |
||
1330 | assert(msg_data.file == BRW_IMMEDIATE_VALUE && |
||
1331 | msg_data.type == BRW_REGISTER_TYPE_UD); |
||
1332 | |||
1333 | brw_pixel_interpolator_query(p, |
||
1334 | retype(dst, BRW_REGISTER_TYPE_UW), |
||
1335 | src, |
||
1336 | inst->pi_noperspective, |
||
1337 | msg_type, |
||
1338 | msg_data.dw1.ud, |
||
1339 | inst->mlen, |
||
1340 | inst->regs_written); |
||
1341 | } |
||
1342 | |||
1343 | |||
1344 | /** |
||
1345 | * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant |
||
1346 | * sampler LD messages. |
||
1347 | * |
||
1348 | * We don't want to bake it into the send message's code generation because |
||
1349 | * that means we don't get a chance to schedule the instructions. |
||
1350 | */ |
||
1351 | void |
||
1352 | fs_generator::generate_set_simd4x2_offset(fs_inst *inst, |
||
1353 | struct brw_reg dst, |
||
1354 | struct brw_reg value) |
||
1355 | { |
||
1356 | assert(value.file == BRW_IMMEDIATE_VALUE); |
||
1357 | |||
1358 | brw_push_insn_state(p); |
||
1359 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1360 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1361 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
1362 | brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value); |
||
1363 | brw_pop_insn_state(p); |
||
1364 | } |
||
1365 | |||
1366 | /* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0 |
||
1367 | * (when mask is passed as a uniform) of register mask before moving it |
||
1368 | * to register dst. |
||
1369 | */ |
||
1370 | void |
||
1371 | fs_generator::generate_set_omask(fs_inst *inst, |
||
1372 | struct brw_reg dst, |
||
1373 | struct brw_reg mask) |
||
1374 | { |
||
1375 | bool stride_8_8_1 = |
||
1376 | (mask.vstride == BRW_VERTICAL_STRIDE_8 && |
||
1377 | mask.width == BRW_WIDTH_8 && |
||
1378 | mask.hstride == BRW_HORIZONTAL_STRIDE_1); |
||
1379 | |||
1380 | bool stride_0_1_0 = has_scalar_region(mask); |
||
1381 | |||
1382 | assert(stride_8_8_1 || stride_0_1_0); |
||
1383 | assert(dst.type == BRW_REGISTER_TYPE_UW); |
||
1384 | |||
1385 | brw_push_insn_state(p); |
||
1386 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1387 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
1388 | |||
1389 | if (stride_8_8_1) { |
||
1390 | brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type)); |
||
1391 | } else if (stride_0_1_0) { |
||
1392 | brw_MOV(p, dst, retype(mask, dst.type)); |
||
1393 | } |
||
1394 | brw_pop_insn_state(p); |
||
1395 | } |
||
1396 | |||
1397 | /* Sets vstride=1, width=4, hstride=0 of register src1 during |
||
1398 | * the ADD instruction. |
||
1399 | */ |
||
1400 | void |
||
1401 | fs_generator::generate_set_sample_id(fs_inst *inst, |
||
1402 | struct brw_reg dst, |
||
1403 | struct brw_reg src0, |
||
1404 | struct brw_reg src1) |
||
1405 | { |
||
1406 | assert(dst.type == BRW_REGISTER_TYPE_D || |
||
1407 | dst.type == BRW_REGISTER_TYPE_UD); |
||
1408 | assert(src0.type == BRW_REGISTER_TYPE_D || |
||
1409 | src0.type == BRW_REGISTER_TYPE_UD); |
||
1410 | |||
1411 | brw_push_insn_state(p); |
||
1412 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1413 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1414 | brw_set_default_mask_control(p, BRW_MASK_DISABLE); |
||
1415 | struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW); |
||
1416 | if (dispatch_width == 8) { |
||
1417 | brw_ADD(p, dst, src0, reg); |
||
1418 | } else if (dispatch_width == 16) { |
||
1419 | brw_ADD(p, firsthalf(dst), firsthalf(src0), reg); |
||
1420 | brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2)); |
||
1421 | } |
||
1422 | brw_pop_insn_state(p); |
||
1423 | } |
||
1424 | |||
1425 | void |
||
1426 | fs_generator::generate_pack_half_2x16_split(fs_inst *inst, |
||
1427 | struct brw_reg dst, |
||
1428 | struct brw_reg x, |
||
1429 | struct brw_reg y) |
||
1430 | { |
||
1431 | assert(devinfo->gen >= 7); |
||
1432 | assert(dst.type == BRW_REGISTER_TYPE_UD); |
||
1433 | assert(x.type == BRW_REGISTER_TYPE_F); |
||
1434 | assert(y.type == BRW_REGISTER_TYPE_F); |
||
1435 | |||
1436 | /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: |
||
1437 | * |
||
1438 | * Because this instruction does not have a 16-bit floating-point type, |
||
1439 | * the destination data type must be Word (W). |
||
1440 | * |
||
1441 | * The destination must be DWord-aligned and specify a horizontal stride |
||
1442 | * (HorzStride) of 2. The 16-bit result is stored in the lower word of |
||
1443 | * each destination channel and the upper word is not modified. |
||
1444 | */ |
||
1445 | struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); |
||
1446 | |||
1447 | /* Give each 32-bit channel of dst the form below, where "." means |
||
1448 | * unchanged. |
||
1449 | * 0x....hhhh |
||
1450 | */ |
||
1451 | brw_F32TO16(p, dst_w, y); |
||
1452 | |||
1453 | /* Now the form: |
||
1454 | * 0xhhhh0000 |
||
1455 | */ |
||
1456 | brw_SHL(p, dst, dst, brw_imm_ud(16u)); |
||
1457 | |||
1458 | /* And, finally the form of packHalf2x16's output: |
||
1459 | * 0xhhhhllll |
||
1460 | */ |
||
1461 | brw_F32TO16(p, dst_w, x); |
||
1462 | } |
||
1463 | |||
1464 | void |
||
1465 | fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, |
||
1466 | struct brw_reg dst, |
||
1467 | struct brw_reg src) |
||
1468 | { |
||
1469 | assert(devinfo->gen >= 7); |
||
1470 | assert(dst.type == BRW_REGISTER_TYPE_F); |
||
1471 | assert(src.type == BRW_REGISTER_TYPE_UD); |
||
1472 | |||
1473 | /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: |
||
1474 | * |
||
1475 | * Because this instruction does not have a 16-bit floating-point type, |
||
1476 | * the source data type must be Word (W). The destination type must be |
||
1477 | * F (Float). |
||
1478 | */ |
||
1479 | struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2); |
||
1480 | |||
1481 | /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll. |
||
1482 | * For the Y case, we wish to access only the upper word; therefore |
||
1483 | * a 16-bit subregister offset is needed. |
||
1484 | */ |
||
1485 | assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X || |
||
1486 | inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y); |
||
1487 | if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y) |
||
1488 | src_w.subnr += 2; |
||
1489 | |||
1490 | brw_F16TO32(p, dst, src_w); |
||
1491 | } |
||
1492 | |||
1493 | void |
||
1494 | fs_generator::generate_shader_time_add(fs_inst *inst, |
||
1495 | struct brw_reg payload, |
||
1496 | struct brw_reg offset, |
||
1497 | struct brw_reg value) |
||
1498 | { |
||
1499 | assert(devinfo->gen >= 7); |
||
1500 | brw_push_insn_state(p); |
||
1501 | brw_set_default_mask_control(p, true); |
||
1502 | |||
1503 | assert(payload.file == BRW_GENERAL_REGISTER_FILE); |
||
1504 | struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), |
||
1505 | offset.type); |
||
1506 | struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0), |
||
1507 | value.type); |
||
1508 | |||
1509 | assert(offset.file == BRW_IMMEDIATE_VALUE); |
||
1510 | if (value.file == BRW_GENERAL_REGISTER_FILE) { |
||
1511 | value.width = BRW_WIDTH_1; |
||
1512 | value.hstride = BRW_HORIZONTAL_STRIDE_0; |
||
1513 | value.vstride = BRW_VERTICAL_STRIDE_0; |
||
1514 | } else { |
||
1515 | assert(value.file == BRW_IMMEDIATE_VALUE); |
||
1516 | } |
||
1517 | |||
1518 | /* Trying to deal with setup of the params from the IR is crazy in the FS8 |
||
1519 | * case, and we don't really care about squeezing every bit of performance |
||
1520 | * out of this path, so we just emit the MOVs from here. |
||
1521 | */ |
||
1522 | brw_MOV(p, payload_offset, offset); |
||
1523 | brw_MOV(p, payload_value, value); |
||
1524 | brw_shader_time_add(p, payload, |
||
1525 | prog_data->binding_table.shader_time_start); |
||
1526 | brw_pop_insn_state(p); |
||
1527 | |||
1528 | brw_mark_surface_used(prog_data, |
||
1529 | prog_data->binding_table.shader_time_start); |
||
1530 | } |
||
1531 | |||
1532 | void |
||
1533 | fs_generator::enable_debug(const char *shader_name) |
||
1534 | { |
||
1535 | debug_flag = true; |
||
1536 | this->shader_name = shader_name; |
||
1537 | } |
||
1538 | |||
1539 | int |
||
1540 | fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) |
||
1541 | { |
||
1542 | /* align to 64 byte boundary. */ |
||
1543 | while (p->next_insn_offset % 64) |
||
1544 | brw_NOP(p); |
||
1545 | |||
1546 | this->dispatch_width = dispatch_width; |
||
1547 | if (dispatch_width == 16) |
||
1548 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1549 | |||
1550 | int start_offset = p->next_insn_offset; |
||
1551 | int spill_count = 0, fill_count = 0; |
||
1552 | int loop_count = 0; |
||
1553 | |||
1554 | struct annotation_info annotation; |
||
1555 | memset(&annotation, 0, sizeof(annotation)); |
||
1556 | |||
1557 | foreach_block_and_inst (block, fs_inst, inst, cfg) { |
||
1558 | struct brw_reg src[3], dst; |
||
1559 | unsigned int last_insn_offset = p->next_insn_offset; |
||
1560 | bool multiple_instructions_emitted = false; |
||
1561 | |||
1562 | if (unlikely(debug_flag)) |
||
1563 | annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); |
||
1564 | |||
1565 | for (unsigned int i = 0; i < inst->sources; i++) { |
||
1566 | src[i] = brw_reg_from_fs_reg(&inst->src[i]); |
||
1567 | |||
1568 | /* The accumulator result appears to get used for the |
||
1569 | * conditional modifier generation. When negating a UD |
||
1570 | * value, there is a 33rd bit generated for the sign in the |
||
1571 | * accumulator value, so now you can't check, for example, |
||
1572 | * equality with a 32-bit value. See piglit fs-op-neg-uvec4. |
||
1573 | */ |
||
1574 | assert(!inst->conditional_mod || |
||
1575 | inst->src[i].type != BRW_REGISTER_TYPE_UD || |
||
1576 | !inst->src[i].negate); |
||
1577 | } |
||
1578 | dst = brw_reg_from_fs_reg(&inst->dst); |
||
1579 | |||
1580 | brw_set_default_predicate_control(p, inst->predicate); |
||
1581 | brw_set_default_predicate_inverse(p, inst->predicate_inverse); |
||
1582 | brw_set_default_flag_reg(p, 0, inst->flag_subreg); |
||
1583 | brw_set_default_saturate(p, inst->saturate); |
||
1584 | brw_set_default_mask_control(p, inst->force_writemask_all); |
||
1585 | brw_set_default_acc_write_control(p, inst->writes_accumulator); |
||
1586 | brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); |
||
1587 | |||
1588 | switch (inst->exec_size) { |
||
1589 | case 1: |
||
1590 | case 2: |
||
1591 | case 4: |
||
1592 | assert(inst->force_writemask_all); |
||
1593 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1594 | break; |
||
1595 | case 8: |
||
1596 | if (inst->force_sechalf) { |
||
1597 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1598 | } else { |
||
1599 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1600 | } |
||
1601 | break; |
||
1602 | case 16: |
||
1603 | case 32: |
||
1604 | /* If the instruction writes to more than one register, it needs to |
||
1605 | * be a "compressed" instruction on Gen <= 5. |
||
1606 | */ |
||
1607 | if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32) |
||
1608 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1609 | else |
||
1610 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1611 | break; |
||
1612 | default: |
||
1613 | unreachable("Invalid instruction width"); |
||
1614 | } |
||
1615 | |||
1616 | switch (inst->opcode) { |
||
1617 | case BRW_OPCODE_MOV: |
||
1618 | brw_MOV(p, dst, src[0]); |
||
1619 | break; |
||
1620 | case BRW_OPCODE_ADD: |
||
1621 | brw_ADD(p, dst, src[0], src[1]); |
||
1622 | break; |
||
1623 | case BRW_OPCODE_MUL: |
||
1624 | brw_MUL(p, dst, src[0], src[1]); |
||
1625 | break; |
||
1626 | case BRW_OPCODE_AVG: |
||
1627 | brw_AVG(p, dst, src[0], src[1]); |
||
1628 | break; |
||
1629 | case BRW_OPCODE_MACH: |
||
1630 | brw_MACH(p, dst, src[0], src[1]); |
||
1631 | break; |
||
1632 | |||
1633 | case BRW_OPCODE_LINE: |
||
1634 | brw_LINE(p, dst, src[0], src[1]); |
||
1635 | break; |
||
1636 | |||
1637 | case BRW_OPCODE_MAD: |
||
1638 | assert(devinfo->gen >= 6); |
||
1639 | brw_set_default_access_mode(p, BRW_ALIGN_16); |
||
1640 | if (dispatch_width == 16 && !devinfo->supports_simd16_3src) { |
||
1641 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1642 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1643 | brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); |
||
1644 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1645 | brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); |
||
1646 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1647 | |||
1648 | if (inst->conditional_mod) { |
||
1649 | brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod); |
||
1650 | brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod); |
||
1651 | multiple_instructions_emitted = true; |
||
1652 | } |
||
1653 | } else { |
||
1654 | brw_MAD(p, dst, src[0], src[1], src[2]); |
||
1655 | } |
||
1656 | brw_set_default_access_mode(p, BRW_ALIGN_1); |
||
1657 | break; |
||
1658 | |||
1659 | case BRW_OPCODE_LRP: |
||
1660 | assert(devinfo->gen >= 6); |
||
1661 | brw_set_default_access_mode(p, BRW_ALIGN_16); |
||
1662 | if (dispatch_width == 16 && !devinfo->supports_simd16_3src) { |
||
1663 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1664 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1665 | brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); |
||
1666 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1667 | brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); |
||
1668 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1669 | |||
1670 | if (inst->conditional_mod) { |
||
1671 | brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod); |
||
1672 | brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod); |
||
1673 | multiple_instructions_emitted = true; |
||
1674 | } |
||
1675 | } else { |
||
1676 | brw_LRP(p, dst, src[0], src[1], src[2]); |
||
1677 | } |
||
1678 | brw_set_default_access_mode(p, BRW_ALIGN_1); |
||
1679 | break; |
||
1680 | |||
1681 | case BRW_OPCODE_FRC: |
||
1682 | brw_FRC(p, dst, src[0]); |
||
1683 | break; |
||
1684 | case BRW_OPCODE_RNDD: |
||
1685 | brw_RNDD(p, dst, src[0]); |
||
1686 | break; |
||
1687 | case BRW_OPCODE_RNDE: |
||
1688 | brw_RNDE(p, dst, src[0]); |
||
1689 | break; |
||
1690 | case BRW_OPCODE_RNDZ: |
||
1691 | brw_RNDZ(p, dst, src[0]); |
||
1692 | break; |
||
1693 | |||
1694 | case BRW_OPCODE_AND: |
||
1695 | brw_AND(p, dst, src[0], src[1]); |
||
1696 | break; |
||
1697 | case BRW_OPCODE_OR: |
||
1698 | brw_OR(p, dst, src[0], src[1]); |
||
1699 | break; |
||
1700 | case BRW_OPCODE_XOR: |
||
1701 | brw_XOR(p, dst, src[0], src[1]); |
||
1702 | break; |
||
1703 | case BRW_OPCODE_NOT: |
||
1704 | brw_NOT(p, dst, src[0]); |
||
1705 | break; |
||
1706 | case BRW_OPCODE_ASR: |
||
1707 | brw_ASR(p, dst, src[0], src[1]); |
||
1708 | break; |
||
1709 | case BRW_OPCODE_SHR: |
||
1710 | brw_SHR(p, dst, src[0], src[1]); |
||
1711 | break; |
||
1712 | case BRW_OPCODE_SHL: |
||
1713 | brw_SHL(p, dst, src[0], src[1]); |
||
1714 | break; |
||
1715 | case BRW_OPCODE_F32TO16: |
||
1716 | assert(devinfo->gen >= 7); |
||
1717 | brw_F32TO16(p, dst, src[0]); |
||
1718 | break; |
||
1719 | case BRW_OPCODE_F16TO32: |
||
1720 | assert(devinfo->gen >= 7); |
||
1721 | brw_F16TO32(p, dst, src[0]); |
||
1722 | break; |
||
1723 | case BRW_OPCODE_CMP: |
||
1724 | /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says |
||
1725 | * that when the destination is a GRF that the dependency-clear bit on |
||
1726 | * the flag register is cleared early. |
||
1727 | * |
||
1728 | * Suggested workarounds are to disable coissuing CMP instructions |
||
1729 | * or to split CMP(16) instructions into two CMP(8) instructions. |
||
1730 | * |
||
1731 | * We choose to split into CMP(8) instructions since disabling |
||
1732 | * coissuing would affect CMP instructions not otherwise affected by |
||
1733 | * the errata. |
||
1734 | */ |
||
1735 | if (dispatch_width == 16 && devinfo->gen == 7 && !devinfo->is_haswell) { |
||
1736 | if (dst.file == BRW_GENERAL_REGISTER_FILE) { |
||
1737 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1738 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1739 | brw_CMP(p, firsthalf(dst), inst->conditional_mod, |
||
1740 | firsthalf(src[0]), firsthalf(src[1])); |
||
1741 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1742 | brw_CMP(p, sechalf(dst), inst->conditional_mod, |
||
1743 | sechalf(src[0]), sechalf(src[1])); |
||
1744 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1745 | |||
1746 | multiple_instructions_emitted = true; |
||
1747 | } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { |
||
1748 | /* For unknown reasons, the aforementioned workaround is not |
||
1749 | * sufficient. Overriding the type when the destination is the |
||
1750 | * null register is necessary but not sufficient by itself. |
||
1751 | */ |
||
1752 | assert(dst.nr == BRW_ARF_NULL); |
||
1753 | dst.type = BRW_REGISTER_TYPE_D; |
||
1754 | brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); |
||
1755 | } else { |
||
1756 | unreachable("not reached"); |
||
1757 | } |
||
1758 | } else { |
||
1759 | brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); |
||
1760 | } |
||
1761 | break; |
||
1762 | case BRW_OPCODE_SEL: |
||
1763 | brw_SEL(p, dst, src[0], src[1]); |
||
1764 | break; |
||
1765 | case BRW_OPCODE_BFREV: |
||
1766 | assert(devinfo->gen >= 7); |
||
1767 | /* BFREV only supports UD type for src and dst. */ |
||
1768 | brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), |
||
1769 | retype(src[0], BRW_REGISTER_TYPE_UD)); |
||
1770 | break; |
||
1771 | case BRW_OPCODE_FBH: |
||
1772 | assert(devinfo->gen >= 7); |
||
1773 | /* FBH only supports UD type for dst. */ |
||
1774 | brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); |
||
1775 | break; |
||
1776 | case BRW_OPCODE_FBL: |
||
1777 | assert(devinfo->gen >= 7); |
||
1778 | /* FBL only supports UD type for dst. */ |
||
1779 | brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); |
||
1780 | break; |
||
1781 | case BRW_OPCODE_CBIT: |
||
1782 | assert(devinfo->gen >= 7); |
||
1783 | /* CBIT only supports UD type for dst. */ |
||
1784 | brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); |
||
1785 | break; |
||
1786 | case BRW_OPCODE_ADDC: |
||
1787 | assert(devinfo->gen >= 7); |
||
1788 | brw_ADDC(p, dst, src[0], src[1]); |
||
1789 | break; |
||
1790 | case BRW_OPCODE_SUBB: |
||
1791 | assert(devinfo->gen >= 7); |
||
1792 | brw_SUBB(p, dst, src[0], src[1]); |
||
1793 | break; |
||
1794 | case BRW_OPCODE_MAC: |
||
1795 | brw_MAC(p, dst, src[0], src[1]); |
||
1796 | break; |
||
1797 | |||
1798 | case BRW_OPCODE_BFE: |
||
1799 | assert(devinfo->gen >= 7); |
||
1800 | brw_set_default_access_mode(p, BRW_ALIGN_16); |
||
1801 | if (dispatch_width == 16 && !devinfo->supports_simd16_3src) { |
||
1802 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1803 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1804 | brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); |
||
1805 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1806 | brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); |
||
1807 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1808 | } else { |
||
1809 | brw_BFE(p, dst, src[0], src[1], src[2]); |
||
1810 | } |
||
1811 | brw_set_default_access_mode(p, BRW_ALIGN_1); |
||
1812 | break; |
||
1813 | |||
1814 | case BRW_OPCODE_BFI1: |
||
1815 | assert(devinfo->gen >= 7); |
||
1816 | /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we |
||
1817 | * should |
||
1818 | * |
||
1819 | * "Force BFI instructions to be executed always in SIMD8." |
||
1820 | */ |
||
1821 | if (dispatch_width == 16 && devinfo->is_haswell) { |
||
1822 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1823 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1824 | brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1])); |
||
1825 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1826 | brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1])); |
||
1827 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1828 | } else { |
||
1829 | brw_BFI1(p, dst, src[0], src[1]); |
||
1830 | } |
||
1831 | break; |
||
1832 | case BRW_OPCODE_BFI2: |
||
1833 | assert(devinfo->gen >= 7); |
||
1834 | brw_set_default_access_mode(p, BRW_ALIGN_16); |
||
1835 | /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we |
||
1836 | * should |
||
1837 | * |
||
1838 | * "Force BFI instructions to be executed always in SIMD8." |
||
1839 | * |
||
1840 | * Otherwise we would be able to emit compressed instructions like we |
||
1841 | * do for the other three-source instructions. |
||
1842 | */ |
||
1843 | if (dispatch_width == 16 && |
||
1844 | (devinfo->is_haswell || !devinfo->supports_simd16_3src)) { |
||
1845 | brw_set_default_exec_size(p, BRW_EXECUTE_8); |
||
1846 | brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); |
||
1847 | brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); |
||
1848 | brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); |
||
1849 | brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); |
||
1850 | brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
||
1851 | } else { |
||
1852 | brw_BFI2(p, dst, src[0], src[1], src[2]); |
||
1853 | } |
||
1854 | brw_set_default_access_mode(p, BRW_ALIGN_1); |
||
1855 | break; |
||
1856 | |||
1857 | case BRW_OPCODE_IF: |
||
1858 | if (inst->src[0].file != BAD_FILE) { |
||
1859 | /* The instruction has an embedded compare (only allowed on gen6) */ |
||
1860 | assert(devinfo->gen == 6); |
||
1861 | gen6_IF(p, inst->conditional_mod, src[0], src[1]); |
||
1862 | } else { |
||
1863 | brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8); |
||
1864 | } |
||
1865 | break; |
||
1866 | |||
1867 | case BRW_OPCODE_ELSE: |
||
1868 | brw_ELSE(p); |
||
1869 | break; |
||
1870 | case BRW_OPCODE_ENDIF: |
||
1871 | brw_ENDIF(p); |
||
1872 | break; |
||
1873 | |||
1874 | case BRW_OPCODE_DO: |
||
1875 | brw_DO(p, BRW_EXECUTE_8); |
||
1876 | break; |
||
1877 | |||
1878 | case BRW_OPCODE_BREAK: |
||
1879 | brw_BREAK(p); |
||
1880 | brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); |
||
1881 | break; |
||
1882 | case BRW_OPCODE_CONTINUE: |
||
1883 | brw_CONT(p); |
||
1884 | brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); |
||
1885 | break; |
||
1886 | |||
1887 | case BRW_OPCODE_WHILE: |
||
1888 | brw_WHILE(p); |
||
1889 | loop_count++; |
||
1890 | break; |
||
1891 | |||
1892 | case SHADER_OPCODE_RCP: |
||
1893 | case SHADER_OPCODE_RSQ: |
||
1894 | case SHADER_OPCODE_SQRT: |
||
1895 | case SHADER_OPCODE_EXP2: |
||
1896 | case SHADER_OPCODE_LOG2: |
||
1897 | case SHADER_OPCODE_SIN: |
||
1898 | case SHADER_OPCODE_COS: |
||
1899 | assert(devinfo->gen < 6 || inst->mlen == 0); |
||
1900 | assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); |
||
1901 | if (devinfo->gen >= 7) { |
||
1902 | gen6_math(p, dst, brw_math_function(inst->opcode), src[0], |
||
1903 | brw_null_reg()); |
||
1904 | } else if (devinfo->gen == 6) { |
||
1905 | generate_math_gen6(inst, dst, src[0], brw_null_reg()); |
||
1906 | } else if (devinfo->gen == 5 || devinfo->is_g4x) { |
||
1907 | generate_math_g45(inst, dst, src[0]); |
||
1908 | } else { |
||
1909 | generate_math_gen4(inst, dst, src[0]); |
||
1910 | } |
||
1911 | break; |
||
1912 | case SHADER_OPCODE_INT_QUOTIENT: |
||
1913 | case SHADER_OPCODE_INT_REMAINDER: |
||
1914 | case SHADER_OPCODE_POW: |
||
1915 | assert(devinfo->gen < 6 || inst->mlen == 0); |
||
1916 | assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); |
||
1917 | if (devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) { |
||
1918 | gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); |
||
1919 | } else if (devinfo->gen >= 6) { |
||
1920 | generate_math_gen6(inst, dst, src[0], src[1]); |
||
1921 | } else { |
||
1922 | generate_math_gen4(inst, dst, src[0]); |
||
1923 | } |
||
1924 | break; |
||
1925 | case FS_OPCODE_CINTERP: |
||
1926 | brw_MOV(p, dst, src[0]); |
||
1927 | break; |
||
1928 | case FS_OPCODE_LINTERP: |
||
1929 | generate_linterp(inst, dst, src); |
||
1930 | break; |
||
1931 | case FS_OPCODE_PIXEL_X: |
||
1932 | assert(src[0].type == BRW_REGISTER_TYPE_UW); |
||
1933 | src[0].subnr = 0 * type_sz(src[0].type); |
||
1934 | brw_MOV(p, dst, stride(src[0], 8, 4, 1)); |
||
1935 | break; |
||
1936 | case FS_OPCODE_PIXEL_Y: |
||
1937 | assert(src[0].type == BRW_REGISTER_TYPE_UW); |
||
1938 | src[0].subnr = 4 * type_sz(src[0].type); |
||
1939 | brw_MOV(p, dst, stride(src[0], 8, 4, 1)); |
||
1940 | break; |
||
1941 | case SHADER_OPCODE_TEX: |
||
1942 | case FS_OPCODE_TXB: |
||
1943 | case SHADER_OPCODE_TXD: |
||
1944 | case SHADER_OPCODE_TXF: |
||
1945 | case SHADER_OPCODE_TXF_CMS: |
||
1946 | case SHADER_OPCODE_TXF_UMS: |
||
1947 | case SHADER_OPCODE_TXF_MCS: |
||
1948 | case SHADER_OPCODE_TXL: |
||
1949 | case SHADER_OPCODE_TXS: |
||
1950 | case SHADER_OPCODE_LOD: |
||
1951 | case SHADER_OPCODE_TG4: |
||
1952 | case SHADER_OPCODE_TG4_OFFSET: |
||
1953 | generate_tex(inst, dst, src[0], src[1]); |
||
1954 | break; |
||
1955 | case FS_OPCODE_DDX_COARSE: |
||
1956 | case FS_OPCODE_DDX_FINE: |
||
1957 | generate_ddx(inst->opcode, dst, src[0]); |
||
1958 | break; |
||
1959 | case FS_OPCODE_DDY_COARSE: |
||
1960 | case FS_OPCODE_DDY_FINE: |
||
1961 | assert(src[1].file == BRW_IMMEDIATE_VALUE); |
||
1962 | generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud); |
||
1963 | break; |
||
1964 | |||
1965 | case SHADER_OPCODE_GEN4_SCRATCH_WRITE: |
||
1966 | generate_scratch_write(inst, src[0]); |
||
1967 | spill_count++; |
||
1968 | break; |
||
1969 | |||
1970 | case SHADER_OPCODE_GEN4_SCRATCH_READ: |
||
1971 | generate_scratch_read(inst, dst); |
||
1972 | fill_count++; |
||
1973 | break; |
||
1974 | |||
1975 | case SHADER_OPCODE_GEN7_SCRATCH_READ: |
||
1976 | generate_scratch_read_gen7(inst, dst); |
||
1977 | fill_count++; |
||
1978 | break; |
||
1979 | |||
1980 | case SHADER_OPCODE_URB_WRITE_SIMD8: |
||
1981 | generate_urb_write(inst, src[0]); |
||
1982 | break; |
||
1983 | |||
1984 | case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: |
||
1985 | generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); |
||
1986 | break; |
||
1987 | |||
1988 | case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: |
||
1989 | generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]); |
||
1990 | break; |
||
1991 | |||
1992 | case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: |
||
1993 | generate_varying_pull_constant_load(inst, dst, src[0], src[1]); |
||
1994 | break; |
||
1995 | |||
1996 | case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: |
||
1997 | generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]); |
||
1998 | break; |
||
1999 | |||
2000 | case FS_OPCODE_REP_FB_WRITE: |
||
2001 | case FS_OPCODE_FB_WRITE: |
||
2002 | generate_fb_write(inst, src[0]); |
||
2003 | break; |
||
2004 | |||
2005 | case FS_OPCODE_BLORP_FB_WRITE: |
||
2006 | generate_blorp_fb_write(inst); |
||
2007 | break; |
||
2008 | |||
2009 | case FS_OPCODE_MOV_DISPATCH_TO_FLAGS: |
||
2010 | generate_mov_dispatch_to_flags(inst); |
||
2011 | break; |
||
2012 | |||
2013 | case FS_OPCODE_DISCARD_JUMP: |
||
2014 | generate_discard_jump(inst); |
||
2015 | break; |
||
2016 | |||
2017 | case SHADER_OPCODE_SHADER_TIME_ADD: |
||
2018 | generate_shader_time_add(inst, src[0], src[1], src[2]); |
||
2019 | break; |
||
2020 | |||
2021 | case SHADER_OPCODE_UNTYPED_ATOMIC: |
||
2022 | assert(src[1].file == BRW_IMMEDIATE_VALUE && |
||
2023 | src[2].file == BRW_IMMEDIATE_VALUE); |
||
2024 | brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, |
||
2025 | inst->mlen, !inst->dst.is_null()); |
||
2026 | brw_mark_surface_used(prog_data, src[1].dw1.ud); |
||
2027 | break; |
||
2028 | |||
2029 | case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
||
2030 | assert(src[1].file == BRW_IMMEDIATE_VALUE && |
||
2031 | src[2].file == BRW_IMMEDIATE_VALUE); |
||
2032 | brw_untyped_surface_read(p, dst, src[0], src[1], |
||
2033 | inst->mlen, src[2].dw1.ud); |
||
2034 | brw_mark_surface_used(prog_data, src[1].dw1.ud); |
||
2035 | break; |
||
2036 | |||
2037 | case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
||
2038 | assert(src[2].file == BRW_IMMEDIATE_VALUE); |
||
2039 | brw_untyped_surface_write(p, src[0], src[1], |
||
2040 | inst->mlen, src[2].dw1.ud); |
||
2041 | break; |
||
2042 | |||
2043 | case SHADER_OPCODE_TYPED_ATOMIC: |
||
2044 | assert(src[2].file == BRW_IMMEDIATE_VALUE); |
||
2045 | brw_typed_atomic(p, dst, src[0], src[1], |
||
2046 | src[2].dw1.ud, inst->mlen, !inst->dst.is_null()); |
||
2047 | break; |
||
2048 | |||
2049 | case SHADER_OPCODE_TYPED_SURFACE_READ: |
||
2050 | assert(src[2].file == BRW_IMMEDIATE_VALUE); |
||
2051 | brw_typed_surface_read(p, dst, src[0], src[1], |
||
2052 | inst->mlen, src[2].dw1.ud); |
||
2053 | break; |
||
2054 | |||
2055 | case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
||
2056 | assert(src[2].file == BRW_IMMEDIATE_VALUE); |
||
2057 | brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].dw1.ud); |
||
2058 | break; |
||
2059 | |||
2060 | case SHADER_OPCODE_MEMORY_FENCE: |
||
2061 | brw_memory_fence(p, dst); |
||
2062 | break; |
||
2063 | |||
2064 | case FS_OPCODE_SET_SIMD4X2_OFFSET: |
||
2065 | generate_set_simd4x2_offset(inst, dst, src[0]); |
||
2066 | break; |
||
2067 | |||
2068 | case SHADER_OPCODE_FIND_LIVE_CHANNEL: |
||
2069 | brw_find_live_channel(p, dst); |
||
2070 | break; |
||
2071 | |||
2072 | case SHADER_OPCODE_BROADCAST: |
||
2073 | brw_broadcast(p, dst, src[0], src[1]); |
||
2074 | break; |
||
2075 | |||
2076 | case FS_OPCODE_SET_OMASK: |
||
2077 | generate_set_omask(inst, dst, src[0]); |
||
2078 | break; |
||
2079 | |||
2080 | case FS_OPCODE_SET_SAMPLE_ID: |
||
2081 | generate_set_sample_id(inst, dst, src[0], src[1]); |
||
2082 | break; |
||
2083 | |||
2084 | case FS_OPCODE_PACK_HALF_2x16_SPLIT: |
||
2085 | generate_pack_half_2x16_split(inst, dst, src[0], src[1]); |
||
2086 | break; |
||
2087 | |||
2088 | case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X: |
||
2089 | case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y: |
||
2090 | generate_unpack_half_2x16_split(inst, dst, src[0]); |
||
2091 | break; |
||
2092 | |||
2093 | case FS_OPCODE_PLACEHOLDER_HALT: |
||
2094 | /* This is the place where the final HALT needs to be inserted if |
||
2095 | * we've emitted any discards. If not, this will emit no code. |
||
2096 | */ |
||
2097 | if (!patch_discard_jumps_to_fb_writes()) { |
||
2098 | if (unlikely(debug_flag)) { |
||
2099 | annotation.ann_count--; |
||
2100 | } |
||
2101 | } |
||
2102 | break; |
||
2103 | |||
2104 | case FS_OPCODE_INTERPOLATE_AT_CENTROID: |
||
2105 | generate_pixel_interpolator_query(inst, dst, src[0], src[1], |
||
2106 | GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID); |
||
2107 | break; |
||
2108 | |||
2109 | case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
||
2110 | generate_pixel_interpolator_query(inst, dst, src[0], src[1], |
||
2111 | GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE); |
||
2112 | break; |
||
2113 | |||
2114 | case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
||
2115 | generate_pixel_interpolator_query(inst, dst, src[0], src[1], |
||
2116 | GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); |
||
2117 | break; |
||
2118 | |||
2119 | case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
||
2120 | generate_pixel_interpolator_query(inst, dst, src[0], src[1], |
||
2121 | GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); |
||
2122 | break; |
||
2123 | |||
2124 | case CS_OPCODE_CS_TERMINATE: |
||
2125 | generate_cs_terminate(inst, src[0]); |
||
2126 | break; |
||
2127 | |||
2128 | default: |
||
2129 | unreachable("Unsupported opcode"); |
||
2130 | |||
2131 | case SHADER_OPCODE_LOAD_PAYLOAD: |
||
2132 | unreachable("Should be lowered by lower_load_payload()"); |
||
2133 | } |
||
2134 | |||
2135 | if (multiple_instructions_emitted) |
||
2136 | continue; |
||
2137 | |||
2138 | if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { |
||
2139 | assert(p->next_insn_offset == last_insn_offset + 16 || |
||
2140 | !"conditional_mod, no_dd_check, or no_dd_clear set for IR " |
||
2141 | "emitting more than 1 instruction"); |
||
2142 | |||
2143 | brw_inst *last = &p->store[last_insn_offset / 16]; |
||
2144 | |||
2145 | if (inst->conditional_mod) |
||
2146 | brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); |
||
2147 | brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); |
||
2148 | brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); |
||
2149 | } |
||
2150 | } |
||
2151 | |||
2152 | brw_set_uip_jip(p); |
||
2153 | annotation_finalize(&annotation, p->next_insn_offset); |
||
2154 | |||
2155 | int before_size = p->next_insn_offset - start_offset; |
||
2156 | brw_compact_instructions(p, start_offset, annotation.ann_count, |
||
2157 | annotation.ann); |
||
2158 | int after_size = p->next_insn_offset - start_offset; |
||
2159 | |||
2160 | if (unlikely(debug_flag)) { |
||
2161 | fprintf(stderr, "Native code for %s\n" |
||
2162 | "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" |
||
2163 | " bytes (%.0f%%)\n", |
||
2164 | shader_name, dispatch_width, before_size / 16, loop_count, |
||
2165 | spill_count, fill_count, promoted_constants, before_size, after_size, |
||
2166 | 100.0f * (before_size - after_size) / before_size); |
||
2167 | |||
2168 | dump_assembly(p->store, annotation.ann_count, annotation.ann, |
||
2169 | p->devinfo, prog); |
||
2170 | ralloc_free(annotation.ann); |
||
2171 | } |
||
2172 | |||
2173 | static GLuint msg_id = 0; |
||
2174 | _mesa_gl_debug(&brw->ctx, &msg_id, |
||
2175 | MESA_DEBUG_SOURCE_SHADER_COMPILER, |
||
2176 | MESA_DEBUG_TYPE_OTHER, |
||
2177 | MESA_DEBUG_SEVERITY_NOTIFICATION, |
||
2178 | "%s SIMD%d shader: %d inst, %d loops, %d:%d spills:fills, " |
||
2179 | "Promoted %u constants, compacted %d to %d bytes.\n", |
||
2180 | stage_abbrev, dispatch_width, before_size / 16, loop_count, |
||
2181 | spill_count, fill_count, promoted_constants, before_size, after_size); |
||
2182 | |||
2183 | return start_offset; |
||
2184 | } |
||
2185 | |||
2186 | const unsigned * |
||
2187 | fs_generator::get_assembly(unsigned int *assembly_size) |
||
2188 | { |
||
2189 | return brw_get_program(p, assembly_size); |
||
2190 | }>>=>>>>><>>>=>=>=>26)); |