WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/Mesa/mesa-10.6.0/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp

Rev	Author	Line No.	Line
5564	serge	1	/*
		2	* Copyright © 2012 Intel Corporation
		3	*
		4	* Permission is hereby granted, free of charge, to any person obtaining a
		5	* copy of this software and associated documentation files (the "Software"),
		6	* to deal in the Software without restriction, including without limitation
		7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		8	* and/or sell copies of the Software, and to permit persons to whom the
		9	* Software is furnished to do so, subject to the following conditions:
		10	*
		11	* The above copyright notice and this permission notice (including the next
		12	* paragraph) shall be included in all copies or substantial portions of the
		13	* Software.
		14	*
		15	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		16	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		17	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		18	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
		19	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
		20	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
		21	* IN THE SOFTWARE.
		22	*/
		23
		24	#include "main/teximage.h"
		25	#include "main/fbobject.h"
		26	#include "main/renderbuffer.h"
		27
		28	#include "intel_fbo.h"
		29
		30	#include "brw_blorp.h"
		31	#include "brw_context.h"
		32	#include "brw_blorp_blit_eu.h"
		33	#include "brw_state.h"
		34	#include "brw_meta_util.h"
		35
		36	#define FILE_DEBUG_FLAG DEBUG_BLORP
		37
		38	static struct intel_mipmap_tree *
		39	find_miptree(GLbitfield buffer_bit, struct intel_renderbuffer *irb)
		40	{
		41	struct intel_mipmap_tree *mt = irb->mt;
		42	if (buffer_bit == GL_STENCIL_BUFFER_BIT && mt->stencil_mt)
		43	mt = mt->stencil_mt;
		44	return mt;
		45	}
		46
		47
		48	/**
		49	* Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
		50	* INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
		51	* the physical layer holding sample 0. So, for example, if
		52	* src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
		53	* 4*n.
		54	*/
		55	void
		56	brw_blorp_blit_miptrees(struct brw_context *brw,
		57	struct intel_mipmap_tree *src_mt,
		58	unsigned src_level, unsigned src_layer,
		59	mesa_format src_format,
		60	struct intel_mipmap_tree *dst_mt,
		61	unsigned dst_level, unsigned dst_layer,
		62	mesa_format dst_format,
		63	float src_x0, float src_y0,
		64	float src_x1, float src_y1,
		65	float dst_x0, float dst_y0,
		66	float dst_x1, float dst_y1,
		67	GLenum filter, bool mirror_x, bool mirror_y)
		68	{
		69	/* Get ready to blit. This includes depth resolving the src and dst
		70	* buffers if necessary. Note: it's not necessary to do a color resolve on
		71	* the destination buffer because we use the standard render path to render
		72	* to destination color buffers, and the standard render path is
		73	* fast-color-aware.
		74	*/
		75	intel_miptree_resolve_color(brw, src_mt);
		76	intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_layer);
		77	intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_layer);
		78
		79	DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
		80	"to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
		81	__func__,
		82	src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
		83	src_level, src_layer, src_x0, src_y0, src_x1, src_y1,
		84	dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
		85	dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
		86	mirror_x, mirror_y);
		87
		88	brw_blorp_blit_params params(brw,
		89	src_mt, src_level, src_layer, src_format,
		90	dst_mt, dst_level, dst_layer, dst_format,
		91	src_x0, src_y0,
		92	src_x1, src_y1,
		93	dst_x0, dst_y0,
		94	dst_x1, dst_y1,
		95	filter, mirror_x, mirror_y);
		96	brw_blorp_exec(brw, ¶ms);
		97
		98	intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_layer);
		99	}
		100
		101	static void
		102	do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
		103	struct intel_renderbuffer *src_irb, mesa_format src_format,
		104	struct intel_renderbuffer *dst_irb, mesa_format dst_format,
		105	GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
		106	GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
		107	GLenum filter, bool mirror_x, bool mirror_y)
		108	{
		109	/* Find source/dst miptrees */
		110	struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
		111	struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
		112
		113	/* Do the blit */
		114	brw_blorp_blit_miptrees(brw,
		115	src_mt, src_irb->mt_level, src_irb->mt_layer,
		116	src_format,
		117	dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
		118	dst_format,
		119	srcX0, srcY0, srcX1, srcY1,
		120	dstX0, dstY0, dstX1, dstY1,
		121	filter, mirror_x, mirror_y);
		122
		123	dst_irb->need_downsample = true;
		124	}
		125
		126	static bool
		127	try_blorp_blit(struct brw_context *brw,
		128	const struct gl_framebuffer *read_fb,
		129	const struct gl_framebuffer *draw_fb,
		130	GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
		131	GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
		132	GLenum filter, GLbitfield buffer_bit)
		133	{
		134	struct gl_context *ctx = &brw->ctx;
		135
		136	/* Sync up the state of window system buffers. We need to do this before
		137	* we go looking for the buffers.
		138	*/
		139	intel_prepare_render(brw);
		140
		141	bool mirror_x, mirror_y;
		142	if (brw_meta_mirror_clip_and_scissor(ctx, read_fb, draw_fb,
		143	&srcX0, &srcY0, &srcX1, &srcY1,
		144	&dstX0, &dstY0, &dstX1, &dstY1,
		145	&mirror_x, &mirror_y))
		146	return true;
		147
		148	/* Find buffers */
		149	struct intel_renderbuffer *src_irb;
		150	struct intel_renderbuffer *dst_irb;
		151	struct intel_mipmap_tree *src_mt;
		152	struct intel_mipmap_tree *dst_mt;
		153	switch (buffer_bit) {
		154	case GL_COLOR_BUFFER_BIT:
		155	src_irb = intel_renderbuffer(read_fb->_ColorReadBuffer);
		156	for (unsigned i = 0; i < draw_fb->_NumColorDrawBuffers; ++i) {
		157	dst_irb = intel_renderbuffer(draw_fb->_ColorDrawBuffers[i]);
		158	if (dst_irb)
		159	do_blorp_blit(brw, buffer_bit,
		160	src_irb, src_irb->Base.Base.Format,
		161	dst_irb, dst_irb->Base.Base.Format,
		162	srcX0, srcY0, srcX1, srcY1,
		163	dstX0, dstY0, dstX1, dstY1,
		164	filter, mirror_x, mirror_y);
		165	}
		166	break;
		167	case GL_DEPTH_BUFFER_BIT:
		168	src_irb =
		169	intel_renderbuffer(read_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
		170	dst_irb =
		171	intel_renderbuffer(draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
		172	src_mt = find_miptree(buffer_bit, src_irb);
		173	dst_mt = find_miptree(buffer_bit, dst_irb);
		174
		175	/* We can't handle format conversions between Z24 and other formats
		176	* since we have to lie about the surface format. See the comments in
		177	* brw_blorp_surface_info::set().
		178	*/
		179	if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
		180	(dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
		181	return false;
		182
		183	do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
		184	dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
		185	srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
		186	filter, mirror_x, mirror_y);
		187	break;
		188	case GL_STENCIL_BUFFER_BIT:
		189	src_irb =
		190	intel_renderbuffer(read_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
		191	dst_irb =
		192	intel_renderbuffer(draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
		193	do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
		194	dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
		195	srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
		196	filter, mirror_x, mirror_y);
		197	break;
		198	default:
		199	unreachable("not reached");
		200	}
		201
		202	return true;
		203	}
		204
		205	bool
		206	brw_blorp_copytexsubimage(struct brw_context *brw,
		207	struct gl_renderbuffer *src_rb,
		208	struct gl_texture_image *dst_image,
		209	int slice,
		210	int srcX0, int srcY0,
		211	int dstX0, int dstY0,
		212	int width, int height)
		213	{
		214	struct gl_context *ctx = &brw->ctx;
		215	struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
		216	struct intel_texture_image *intel_image = intel_texture_image(dst_image);
		217
		218	/* Sync up the state of window system buffers. We need to do this before
		219	* we go looking at the src renderbuffer's miptree.
		220	*/
		221	intel_prepare_render(brw);
		222
		223	struct intel_mipmap_tree *src_mt = src_irb->mt;
		224	struct intel_mipmap_tree *dst_mt = intel_image->mt;
		225
		226	/* BLORP is only supported for Gen6-7. */
		227	if (brw->gen < 6 \|\| brw->gen > 7)
		228	return false;
		229
		230	if (_mesa_get_format_base_format(src_rb->Format) !=
		231	_mesa_get_format_base_format(dst_image->TexFormat)) {
		232	return false;
		233	}
		234
		235	/* We can't handle format conversions between Z24 and other formats since
		236	* we have to lie about the surface format. See the comments in
		237	* brw_blorp_surface_info::set().
		238	*/
		239	if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
		240	(dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT)) {
		241	return false;
		242	}
		243
		244	if (!brw->format_supported_as_render_target[dst_image->TexFormat])
		245	return false;
		246
		247	/* Source clipping shouldn't be necessary, since copytexsubimage (in
		248	* src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
		249	* takes care of it.
		250	*
		251	* Destination clipping shouldn't be necessary since the restrictions on
		252	* glCopyTexSubImage prevent the user from specifying a destination rectangle
		253	* that falls outside the bounds of the destination texture.
		254	* See error_check_subtexture_dimensions().
		255	*/
		256
		257	int srcY1 = srcY0 + height;
		258	int srcX1 = srcX0 + width;
		259	int dstX1 = dstX0 + width;
		260	int dstY1 = dstY0 + height;
		261
		262	/* Account for the fact that in the system framebuffer, the origin is at
		263	* the lower left.
		264	*/
		265	bool mirror_y = false;
		266	if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) {
		267	GLint tmp = src_rb->Height - srcY0;
		268	srcY0 = src_rb->Height - srcY1;
		269	srcY1 = tmp;
		270	mirror_y = true;
		271	}
		272
		273	/* Account for face selection and texture view MinLayer */
		274	int dst_slice = slice + dst_image->TexObject->MinLayer + dst_image->Face;
		275	int dst_level = dst_image->Level + dst_image->TexObject->MinLevel;
		276
		277	brw_blorp_blit_miptrees(brw,
		278	src_mt, src_irb->mt_level, src_irb->mt_layer,
		279	src_rb->Format,
		280	dst_mt, dst_level, dst_slice,
		281	dst_image->TexFormat,
		282	srcX0, srcY0, srcX1, srcY1,
		283	dstX0, dstY0, dstX1, dstY1,
		284	GL_NEAREST, false, mirror_y);
		285
		286	/* If we're copying to a packed depth stencil texture and the source
		287	* framebuffer has separate stencil, we need to also copy the stencil data
		288	* over.
		289	*/
		290	src_rb = ctx->ReadBuffer->Attachment[BUFFER_STENCIL].Renderbuffer;
		291	if (_mesa_get_format_bits(dst_image->TexFormat, GL_STENCIL_BITS) > 0 &&
		292	src_rb != NULL) {
		293	src_irb = intel_renderbuffer(src_rb);
		294	src_mt = src_irb->mt;
		295
		296	if (src_mt->stencil_mt)
		297	src_mt = src_mt->stencil_mt;
		298	if (dst_mt->stencil_mt)
		299	dst_mt = dst_mt->stencil_mt;
		300
		301	if (src_mt != dst_mt) {
		302	brw_blorp_blit_miptrees(brw,
		303	src_mt, src_irb->mt_level, src_irb->mt_layer,
		304	src_mt->format,
		305	dst_mt, dst_level, dst_slice,
		306	dst_mt->format,
		307	srcX0, srcY0, srcX1, srcY1,
		308	dstX0, dstY0, dstX1, dstY1,
		309	GL_NEAREST, false, mirror_y);
		310	}
		311	}
		312
		313	return true;
		314	}
		315
		316
		317	GLbitfield
		318	brw_blorp_framebuffer(struct brw_context *brw,
		319	struct gl_framebuffer *readFb,
		320	struct gl_framebuffer *drawFb,
		321	GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
		322	GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
		323	GLbitfield mask, GLenum filter)
		324	{
		325	/* BLORP is not supported before Gen6. */
		326	if (brw->gen < 6 \|\| brw->gen >= 8)
		327	return mask;
		328
		329	static GLbitfield buffer_bits[] = {
		330	GL_COLOR_BUFFER_BIT,
		331	GL_DEPTH_BUFFER_BIT,
		332	GL_STENCIL_BUFFER_BIT,
		333	};
		334
		335	for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
		336	if ((mask & buffer_bits[i]) &&
		337	try_blorp_blit(brw, readFb, drawFb,
		338	srcX0, srcY0, srcX1, srcY1,
		339	dstX0, dstY0, dstX1, dstY1,
		340	filter, buffer_bits[i])) {
		341	mask &= ~buffer_bits[i];
		342	}
		343	}
		344
		345	return mask;
		346	}
		347
		348
		349	/**
		350	* Enum to specify the order of arguments in a sampler message
		351	*/
		352	enum sampler_message_arg
		353	{
		354	SAMPLER_MESSAGE_ARG_U_FLOAT,
		355	SAMPLER_MESSAGE_ARG_V_FLOAT,
		356	SAMPLER_MESSAGE_ARG_U_INT,
		357	SAMPLER_MESSAGE_ARG_V_INT,
		358	SAMPLER_MESSAGE_ARG_SI_INT,
		359	SAMPLER_MESSAGE_ARG_MCS_INT,
		360	SAMPLER_MESSAGE_ARG_ZERO_INT,
		361	};
		362
		363	/**
		364	* Generator for WM programs used in BLORP blits.
		365	*
		366	* The bulk of the work done by the WM program is to wrap and unwrap the
		367	* coordinate transformations used by the hardware to store surfaces in
		368	* memory. The hardware transforms a pixel location (X, Y, S) (where S is the
		369	* sample index for a multisampled surface) to a memory offset by the
		370	* following formulas:
		371	*
		372	* offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
		373	* (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
		374	*
		375	* For a single-sampled surface, or for a multisampled surface using
		376	* INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
		377	* function:
		378	*
		379	* encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
		380	* decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
		381	* encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
		382	* decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
		383	*
		384	* For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
		385	* embeds the sample number into bit 1 of the X and Y coordinates:
		386	*
		387	* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
		388	* where X' = (X & ~0b1) << 1 \| (S & 0b1) << 1 \| (X & 0b1)
		389	* Y' = (Y & ~0b1 ) << 1 \| (S & 0b10) \| (Y & 0b1)
		390	* decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
		391	* where X' = (X & ~0b11) >> 1 \| (X & 0b1)
		392	* Y' = (Y & ~0b11) >> 1 \| (Y & 0b1)
		393	* S = (Y & 0b10) \| (X & 0b10) >> 1
		394	*
		395	* For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
		396	* embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
		397	* the Y coordinate:
		398	*
		399	* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
		400	* where X' = (X & ~0b1) << 2 \| (S & 0b100) \| (S & 0b1) << 1 \| (X & 0b1)
		401	* Y' = (Y & ~0b1) << 1 \| (S & 0b10) \| (Y & 0b1)
		402	* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
		403	* where X' = (X & ~0b111) >> 2 \| (X & 0b1)
		404	* Y' = (Y & ~0b11) >> 1 \| (Y & 0b1)
		405	* S = (X & 0b100) \| (Y & 0b10) \| (X & 0b10) >> 1
		406	*
		407	* For X tiling, tile() combines together the low-order bits of the X and Y
		408	* coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
		409	* bytes wide and 8 rows high:
		410	*
		411	* tile(x_tiled, X, Y, S) = A
		412	* where A = tile_num << 12 \| offset
		413	* tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
		414	* offset = (Y' & 0b111) << 9
		415	* \| (X & 0b111111111)
		416	* X' = X * cpp
		417	* Y' = Y + S * qpitch
		418	* detile(x_tiled, A) = (X, Y, S)
		419	* where X = X' / cpp
		420	* Y = Y' % qpitch
		421	* S = Y' / qpitch
		422	* Y' = (tile_num / tile_pitch) << 3
		423	* \| (A & 0b111000000000) >> 9
		424	* X' = (tile_num % tile_pitch) << 9
		425	* \| (A & 0b111111111)
		426	*
		427	* (In all tiling formulas, cpp is the number of bytes occupied by a single
		428	* sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
		429	* to fill the width of the surface, and qpitch is the spacing (in rows)
		430	* between array slices).
		431	*
		432	* For Y tiling, tile() combines together the low-order bits of the X and Y
		433	* coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
		434	* bytes wide and 32 rows high:
		435	*
		436	* tile(y_tiled, X, Y, S) = A
		437	* where A = tile_num << 12 \| offset
		438	* tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
		439	* offset = (X' & 0b1110000) << 5
		440	* \| (Y' & 0b11111) << 4
		441	* \| (X' & 0b1111)
		442	* X' = X * cpp
		443	* Y' = Y + S * qpitch
		444	* detile(y_tiled, A) = (X, Y, S)
		445	* where X = X' / cpp
		446	* Y = Y' % qpitch
		447	* S = Y' / qpitch
		448	* Y' = (tile_num / tile_pitch) << 5
		449	* \| (A & 0b111110000) >> 4
		450	* X' = (tile_num % tile_pitch) << 7
		451	* \| (A & 0b111000000000) >> 5
		452	* \| (A & 0b1111)
		453	*
		454	* For W tiling, tile() combines together the low-order bits of the X and Y
		455	* coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
		456	* bytes wide and 64 rows high (note that W tiling is only used for stencil
		457	* buffers, which always have cpp = 1 and S=0):
		458	*
		459	* tile(w_tiled, X, Y, S) = A
		460	* where A = tile_num << 12 \| offset
		461	* tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
		462	* offset = (X' & 0b111000) << 6
		463	* \| (Y' & 0b111100) << 3
		464	* \| (X' & 0b100) << 2
		465	* \| (Y' & 0b10) << 2
		466	* \| (X' & 0b10) << 1
		467	* \| (Y' & 0b1) << 1
		468	* \| (X' & 0b1)
		469	* X' = X * cpp = X
		470	* Y' = Y + S * qpitch
		471	* detile(w_tiled, A) = (X, Y, S)
		472	* where X = X' / cpp = X'
		473	* Y = Y' % qpitch = Y'
		474	* S = Y / qpitch = 0
		475	* Y' = (tile_num / tile_pitch) << 6
		476	* \| (A & 0b111100000) >> 3
		477	* \| (A & 0b1000) >> 2
		478	* \| (A & 0b10) >> 1
		479	* X' = (tile_num % tile_pitch) << 6
		480	* \| (A & 0b111000000000) >> 6
		481	* \| (A & 0b10000) >> 2
		482	* \| (A & 0b100) >> 1
		483	* \| (A & 0b1)
		484	*
		485	* Finally, for a non-tiled surface, tile() simply combines together the X and
		486	* Y coordinates in the natural way:
		487	*
		488	* tile(untiled, X, Y, S) = A
		489	* where A = Y * pitch + X'
		490	* X' = X * cpp
		491	* Y' = Y + S * qpitch
		492	* detile(untiled, A) = (X, Y, S)
		493	* where X = X' / cpp
		494	* Y = Y' % qpitch
		495	* S = Y' / qpitch
		496	* X' = A % pitch
		497	* Y' = A / pitch
		498	*
		499	* (In these formulas, pitch is the number of bytes occupied by a single row
		500	* of samples).
		501	*/
		502	class brw_blorp_blit_program : public brw_blorp_eu_emitter
		503	{
		504	public:
		505	brw_blorp_blit_program(struct brw_context *brw,
		506	const brw_blorp_blit_prog_key *key, bool debug_flag);
		507
		508	const GLuint compile(struct brw_context brw, GLuint *program_size);
		509
		510	brw_blorp_prog_data prog_data;
		511
		512	private:
		513	void alloc_regs();
		514	void alloc_push_const_regs(int base_reg);
		515	void compute_frag_coords();
		516	void translate_tiling(bool old_tiled_w, bool new_tiled_w);
		517	void encode_msaa(unsigned num_samples, intel_msaa_layout layout);
		518	void decode_msaa(unsigned num_samples, intel_msaa_layout layout);
		519	void translate_dst_to_src();
		520	void clamp_tex_coords(struct brw_reg regX, struct brw_reg regY,
		521	struct brw_reg clampX0, struct brw_reg clampY0,
		522	struct brw_reg clampX1, struct brw_reg clampY1);
		523	void single_to_blend();
		524	void manual_blend_average(unsigned num_samples);
		525	void manual_blend_bilinear(unsigned num_samples);
		526	void sample(struct brw_reg dst);
		527	void texel_fetch(struct brw_reg dst);
		528	void mcs_fetch();
		529	void texture_lookup(struct brw_reg dst, enum opcode op,
		530	const sampler_message_arg *args, int num_args);
		531	void render_target_write();
		532
		533	/**
		534	* Base-2 logarithm of the maximum number of samples that can be blended.
		535	*/
		536	static const unsigned LOG2_MAX_BLEND_SAMPLES = 3;
		537
		538	struct brw_context *brw;
		539	const brw_blorp_blit_prog_key *key;
		540
		541	/* Thread dispatch header */
		542	struct brw_reg R0;
		543
		544	/* Pixel X/Y coordinates (always in R1). */
		545	struct brw_reg R1;
		546
		547	/* Push constants */
		548	struct brw_reg dst_x0;
		549	struct brw_reg dst_x1;
		550	struct brw_reg dst_y0;
		551	struct brw_reg dst_y1;
		552	/* Top right coordinates of the rectangular grid used for scaled blitting */
		553	struct brw_reg rect_grid_x1;
		554	struct brw_reg rect_grid_y1;
		555	struct {
		556	struct brw_reg multiplier;
		557	struct brw_reg offset;
		558	} x_transform, y_transform;
		559
		560	/* Data read from texture (4 vec16's per array element) */
		561	struct brw_reg texture_data[LOG2_MAX_BLEND_SAMPLES + 1];
		562
		563	/* Auxiliary storage for the contents of the MCS surface.
		564	*
		565	* Since the sampler always returns 8 registers worth of data, this is 8
		566	* registers wide, even though we only use the first 2 registers of it.
		567	*/
		568	struct brw_reg mcs_data;
		569
		570	/* X coordinates. We have two of them so that we can perform coordinate
		571	* transformations easily.
		572	*/
		573	struct brw_reg x_coords[2];
		574
		575	/* Y coordinates. We have two of them so that we can perform coordinate
		576	* transformations easily.
		577	*/
		578	struct brw_reg y_coords[2];
		579
		580	/* X, Y coordinates of the pixel from which we need to fetch the specific
		581	* sample. These are used for multisample scaled blitting.
		582	*/
		583	struct brw_reg x_sample_coords;
		584	struct brw_reg y_sample_coords;
		585
		586	/* Fractional parts of the x and y coordinates, used as bilinear interpolation coefficients */
		587	struct brw_reg x_frac;
		588	struct brw_reg y_frac;
		589
		590	/* Which element of x_coords and y_coords is currently in use.
		591	*/
		592	int xy_coord_index;
		593
		594	/* True if, at the point in the program currently being compiled, the
		595	* sample index is known to be zero.
		596	*/
		597	bool s_is_zero;
		598
		599	/* Register storing the sample index when s_is_zero is false. */
		600	struct brw_reg sample_index;
		601
		602	/* Temporaries */
		603	struct brw_reg t1;
		604	struct brw_reg t2;
		605
		606	/* MRF used for sampling and render target writes */
		607	GLuint base_mrf;
		608	};
		609
/**
 * Construct a blit program generator.
 *
 * \p brw and \p debug_flag are forwarded to the brw_blorp_eu_emitter base
 * class; the \p brw and \p key pointers are stored as-is (no copy of the key
 * is made) for use during compile().
 */
brw_blorp_blit_program::brw_blorp_blit_program(
      struct brw_context *brw,
      const brw_blorp_blit_prog_key *key,
      bool debug_flag)
   : brw_blorp_eu_emitter(brw, debug_flag),
     brw(brw),
     key(key)
{
}
		619
		620	const GLuint *
		621	brw_blorp_blit_program::compile(struct brw_context *brw,
		622	GLuint *program_size)
		623	{
		624	/* Sanity checks */
		625	if (key->dst_tiled_w && key->rt_samples > 0) {
		626	/* If the destination image is W tiled and multisampled, then the thread
		627	* must be dispatched once per sample, not once per pixel. This is
		628	* necessary because after conversion between W and Y tiling, there's no
		629	* guarantee that all samples corresponding to a single pixel will still
		630	* be together.
		631	*/
		632	assert(key->persample_msaa_dispatch);
		633	}
		634
		635	if (key->blend) {
		636	/* We are blending, which means we won't have an opportunity to
		637	* translate the tiling and sample count for the texture surface. So
		638	* the surface state for the texture must be configured with the correct
		639	* tiling and sample count.
		640	*/
		641	assert(!key->src_tiled_w);
		642	assert(key->tex_samples == key->src_samples);
		643	assert(key->tex_layout == key->src_layout);
		644	assert(key->tex_samples > 0);
		645	}
		646
		647	if (key->persample_msaa_dispatch) {
		648	/* It only makes sense to do persample dispatch if the render target is
		649	* configured as multisampled.
		650	*/
		651	assert(key->rt_samples > 0);
		652	}
		653
		654	/* Make sure layout is consistent with sample count */
		655	assert((key->tex_layout == INTEL_MSAA_LAYOUT_NONE) ==
		656	(key->tex_samples == 0));
		657	assert((key->rt_layout == INTEL_MSAA_LAYOUT_NONE) ==
		658	(key->rt_samples == 0));
		659	assert((key->src_layout == INTEL_MSAA_LAYOUT_NONE) ==
		660	(key->src_samples == 0));
		661	assert((key->dst_layout == INTEL_MSAA_LAYOUT_NONE) ==
		662	(key->dst_samples == 0));
		663
		664	/* Set up prog_data */
		665	memset(&prog_data, 0, sizeof(prog_data));
		666	prog_data.persample_msaa_dispatch = key->persample_msaa_dispatch;
		667
		668	alloc_regs();
		669	compute_frag_coords();
		670
		671	/* Render target and texture hardware don't support W tiling. */
		672	const bool rt_tiled_w = false;
		673	const bool tex_tiled_w = false;
		674
		675	/* The address that data will be written to is determined by the
		676	* coordinates supplied to the WM thread and the tiling and sample count of
		677	* the render target, according to the formula:
		678	*
		679	* (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
		680	*
		681	* If the actual tiling and sample count of the destination surface are not
		682	* the same as the configuration of the render target, then these
		683	* coordinates are wrong and we have to adjust them to compensate for the
		684	* difference.
		685	*/
		686	if (rt_tiled_w != key->dst_tiled_w \|\|
		687	key->rt_samples != key->dst_samples \|\|
		688	key->rt_layout != key->dst_layout) {
		689	encode_msaa(key->rt_samples, key->rt_layout);
		690	/* Now (X, Y, S) = detile(rt_tiling, offset) */
		691	translate_tiling(rt_tiled_w, key->dst_tiled_w);
		692	/* Now (X, Y, S) = detile(dst_tiling, offset) */
		693	decode_msaa(key->dst_samples, key->dst_layout);
		694	}
		695
		696	/* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
		697	*
		698	* That is: X, Y and S now contain the true coordinates and sample index of
		699	* the data that the WM thread should output.
		700	*
		701	* If we need to kill pixels that are outside the destination rectangle,
		702	* now is the time to do it.
		703	*/
		704
		705	if (key->use_kill)
		706	emit_kill_if_outside_rect(x_coords[xy_coord_index],
		707	y_coords[xy_coord_index],
		708	dst_x0, dst_x1, dst_y0, dst_y1);
		709
		710	/* Next, apply a translation to obtain coordinates in the source image. */
		711	translate_dst_to_src();
		712
		713	/* If the source image is not multisampled, then we want to fetch sample
		714	* number 0, because that's the only sample there is.
		715	*/
		716	if (key->src_samples == 0)
		717	s_is_zero = true;
		718
		719	/* X, Y, and S are now the coordinates of the pixel in the source image
		720	* that we want to texture from. Exception: if we are blending, then S is
		721	* irrelevant, because we are going to fetch all samples.
		722	*/
		723	if (key->blend && !key->blit_scaled) {
		724	if (brw->gen == 6) {
		725	/* Gen6 hardware an automatically blend using the SAMPLE message */
		726	single_to_blend();
		727	sample(texture_data[0]);
		728	} else {
		729	/* Gen7+ hardware doesn't automaticaly blend. */
		730	manual_blend_average(key->src_samples);
		731	}
		732	} else if(key->blend && key->blit_scaled) {
		733	manual_blend_bilinear(key->src_samples);
		734	} else {
		735	/* We aren't blending, which means we just want to fetch a single sample
		736	* from the source surface. The address that we want to fetch from is
		737	* related to the X, Y and S values according to the formula:
		738	*
		739	* (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
		740	*
		741	* If the actual tiling and sample count of the source surface are not
		742	* the same as the configuration of the texture, then we need to adjust
		743	* the coordinates to compensate for the difference.
		744	*/
		745	if ((tex_tiled_w != key->src_tiled_w \|\|
		746	key->tex_samples != key->src_samples \|\|
		747	key->tex_layout != key->src_layout) &&
		748	!key->bilinear_filter) {
		749	encode_msaa(key->src_samples, key->src_layout);
		750	/* Now (X, Y, S) = detile(src_tiling, offset) */
		751	translate_tiling(key->src_tiled_w, tex_tiled_w);
		752	/* Now (X, Y, S) = detile(tex_tiling, offset) */
		753	decode_msaa(key->tex_samples, key->tex_layout);
		754	}
		755
		756	if (key->bilinear_filter) {
		757	sample(texture_data[0]);
		758	}
		759	else {
		760	/* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
		761	*
		762	* In other words: X, Y, and S now contain values which, when passed to
		763	* the texturing unit, will cause data to be read from the correct
		764	* memory location. So we can fetch the texel now.
		765	*/
		766	if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
		767	mcs_fetch();
		768	texel_fetch(texture_data[0]);
		769	}
		770	}
		771
		772	/* Finally, write the fetched (or blended) value to the render target and
		773	* terminate the thread.
		774	*/
		775	render_target_write();
		776
		777	return get_program(program_size);
		778	}
		779
		780	void
		781	brw_blorp_blit_program::alloc_push_const_regs(int base_reg)
		782	{
		783	#define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name)
		784	#define ALLOC_REG(name, type) \
		785	this->name = \
		786	retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, \
		787	base_reg + CONST_LOC(name) / 32, \
		788	(CONST_LOC(name) % 32) / 4), type)
		789
		790	ALLOC_REG(dst_x0, BRW_REGISTER_TYPE_UD);
		791	ALLOC_REG(dst_x1, BRW_REGISTER_TYPE_UD);
		792	ALLOC_REG(dst_y0, BRW_REGISTER_TYPE_UD);
		793	ALLOC_REG(dst_y1, BRW_REGISTER_TYPE_UD);
		794	ALLOC_REG(rect_grid_x1, BRW_REGISTER_TYPE_F);
		795	ALLOC_REG(rect_grid_y1, BRW_REGISTER_TYPE_F);
		796	ALLOC_REG(x_transform.multiplier, BRW_REGISTER_TYPE_F);
		797	ALLOC_REG(x_transform.offset, BRW_REGISTER_TYPE_F);
		798	ALLOC_REG(y_transform.multiplier, BRW_REGISTER_TYPE_F);
		799	ALLOC_REG(y_transform.offset, BRW_REGISTER_TYPE_F);
		800	#undef CONST_LOC
		801	#undef ALLOC_REG
		802	}
		803
		804	void
		805	brw_blorp_blit_program::alloc_regs()
		806	{
		807	int reg = 0;
		808	this->R0 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
		809	this->R1 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
		810	prog_data.first_curbe_grf = reg;
		811	alloc_push_const_regs(reg);
		812	reg += BRW_BLORP_NUM_PUSH_CONST_REGS;
		813	for (unsigned i = 0; i < ARRAY_SIZE(texture_data); ++i) {
		814	this->texture_data[i] =
		815	retype(vec16(brw_vec8_grf(reg, 0)), key->texture_data_type);
		816	reg += 8;
		817	}
		818	this->mcs_data =
		819	retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD); reg += 8;
		820
		821	for (int i = 0; i < 2; ++i) {
		822	this->x_coords[i]
		823	= retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
		824	reg += 2;
		825	this->y_coords[i]
		826	= retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
		827	reg += 2;
		828	}
		829
		830	if (key->blit_scaled && key->blend) {
		831	this->x_sample_coords = brw_vec8_grf(reg, 0);
		832	reg += 2;
		833	this->y_sample_coords = brw_vec8_grf(reg, 0);
		834	reg += 2;
		835	this->x_frac = brw_vec8_grf(reg, 0);
		836	reg += 2;
		837	this->y_frac = brw_vec8_grf(reg, 0);
		838	reg += 2;
		839	}
		840
		841	this->xy_coord_index = 0;
		842	this->sample_index
		843	= retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
		844	reg += 2;
		845	this->t1 = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
		846	reg += 2;
		847	this->t2 = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
		848	reg += 2;
		849
		850	/* Make sure we didn't run out of registers */
		851	assert(reg <= GEN7_MRF_HACK_START);
		852
		853	int mrf = 2;
		854	this->base_mrf = mrf;
		855	}
		856
		857	/* In the code that follows, X and Y can be used to quickly refer to the
		858	* active elements of x_coords and y_coords, and Xp and Yp ("X prime" and "Y
		859	* prime") to the inactive elements.
		860	*
		861	* S can be used to quickly refer to sample_index.
		862	*/
		863	#define X x_coords[xy_coord_index]
		864	#define Y y_coords[xy_coord_index]
		865	#define Xp x_coords[!xy_coord_index]
		866	#define Yp y_coords[!xy_coord_index]
		867	#define S sample_index
		868
		869	/* Quickly swap the roles of (X, Y) and (Xp, Yp). Saves us from having to do
		870	* MOVs to transfer (Xp, Yp) to (X, Y) after a coordinate transformation.
		871	*/
		872	#define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index;
		873
		874	/**
		875	* Emit code to compute the X and Y coordinates of the pixels being rendered
		876	* by this WM invocation.
		877	*
		878	* Assuming the render target is set up for Y tiling, these (X, Y) values are
		879	* related to the address offset where outputs will be written by the formula:
		880	*
		881	* (X, Y, S) = decode_msaa(detile(offset)).
		882	*
		883	* (See brw_blorp_blit_program).
		884	*/
		885	void
		886	brw_blorp_blit_program::compute_frag_coords()
		887	{
		888	/* R1.2[15:0] = X coordinate of upper left pixel of subspan 0 (pixel 0)
		889	* R1.3[15:0] = X coordinate of upper left pixel of subspan 1 (pixel 4)
		890	* R1.4[15:0] = X coordinate of upper left pixel of subspan 2 (pixel 8)
		891	* R1.5[15:0] = X coordinate of upper left pixel of subspan 3 (pixel 12)
		892	*
		893	* Pixels within a subspan are laid out in this arrangement:
		894	* 0 1
		895	* 2 3
		896	*
		897	* So, to compute the coordinates of each pixel, we need to read every 2nd
		898	* 16-bit value (vstride=2) from R1, starting at the 4th 16-bit value
		899	* (suboffset=4), and duplicate each value 4 times (hstride=0, width=4).
		900	* In other words, the data we want to access is R1.4<2;4,0>UW.
		901	*
		902	* Then, we need to add the repeating sequence (0, 1, 0, 1, ...) to the
		903	* result, since pixels n+1 and n+3 are in the right half of the subspan.
		904	*/
		905	emit_add(vec16(retype(X, BRW_REGISTER_TYPE_UW)),
		906	stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010));
		907
		908	/* Similarly, Y coordinates for subspans come from R1.2[31:16] through
		909	* R1.5[31:16], so to get pixel Y coordinates we need to start at the 5th
		910	* 16-bit value instead of the 4th (R1.5<2;4,0>UW instead of
		911	* R1.4<2;4,0>UW).
		912	*
		913	* And we need to add the repeating sequence (0, 0, 1, 1, ...), since
		914	* pixels n+2 and n+3 are in the bottom half of the subspan.
		915	*/
		916	emit_add(vec16(retype(Y, BRW_REGISTER_TYPE_UW)),
		917	stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));
		918
		919	/* Move the coordinates to UD registers. */
		920	emit_mov(vec16(Xp), retype(X, BRW_REGISTER_TYPE_UW));
		921	emit_mov(vec16(Yp), retype(Y, BRW_REGISTER_TYPE_UW));
		922	SWAP_XY_AND_XPYP();
		923
		924	if (key->persample_msaa_dispatch) {
		925	switch (key->rt_samples) {
		926	case 4: {
		927	/* The WM will be run in MSDISPMODE_PERSAMPLE with num_samples == 4.
		928	* Therefore, subspan 0 will represent sample 0, subspan 1 will
		929	* represent sample 1, and so on.
		930	*
		931	* So we need to populate S with the sequence (0, 0, 0, 0, 1, 1, 1,
		932	* 1, 2, 2, 2, 2, 3, 3, 3, 3). The easiest way to do this is to
		933	* populate a temporary variable with the sequence (0, 1, 2, 3), and
		934	* then copy from it using vstride=1, width=4, hstride=0.
		935	*/
		936	struct brw_reg t1_uw1 = retype(t1, BRW_REGISTER_TYPE_UW);
		937	emit_mov(vec16(t1_uw1), brw_imm_v(0x3210));
		938	/* Move to UD sample_index register. */
		939	emit_mov_8(S, stride(t1_uw1, 1, 4, 0));
		940	emit_mov_8(offset(S, 1), suboffset(stride(t1_uw1, 1, 4, 0), 2));
		941	break;
		942	}
		943	case 8: {
		944	/* The WM will be run in MSDISPMODE_PERSAMPLE with num_samples == 8.
		945	* Therefore, subspan 0 will represent sample N (where N is 0 or 4),
		946	* subspan 1 will represent sample 1, and so on. We can find the
		947	* value of N by looking at R0.0 bits 7:6 ("Starting Sample Pair
		948	* Index") and multiplying by two (since samples are always delivered
		949	* in pairs). That is, we compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 &
		950	* 0xc0) >> 5.
		951	*
		952	* Then we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1, 2,
		953	* 2, 2, 2, 3, 3, 3, 3), which we compute by populating a temporary
		954	* variable with the sequence (0, 1, 2, 3), and then reading from it
		955	* using vstride=1, width=4, hstride=0.
		956	*/
		957	struct brw_reg t1_ud1 = vec1(retype(t1, BRW_REGISTER_TYPE_UD));
		958	struct brw_reg t2_uw1 = retype(t2, BRW_REGISTER_TYPE_UW);
		959	struct brw_reg r0_ud1 = vec1(retype(R0, BRW_REGISTER_TYPE_UD));
		960	emit_and(t1_ud1, r0_ud1, brw_imm_ud(0xc0));
		961	emit_shr(t1_ud1, t1_ud1, brw_imm_ud(5));
		962	emit_mov(vec16(t2_uw1), brw_imm_v(0x3210));
		963	emit_add(vec16(S), retype(t1_ud1, BRW_REGISTER_TYPE_UW),
		964	stride(t2_uw1, 1, 4, 0));
		965	emit_add_8(offset(S, 1),
		966	retype(t1_ud1, BRW_REGISTER_TYPE_UW),
		967	suboffset(stride(t2_uw1, 1, 4, 0), 2));
		968	break;
		969	}
		970	default:
		971	unreachable("Unrecognized sample count in "
		972	"brw_blorp_blit_program::compute_frag_coords()");
		973	}
		974	s_is_zero = false;
		975	} else {
		976	/* Either the destination surface is single-sampled, or the WM will be
		977	* run in MSDISPMODE_PERPIXEL (which causes a single fragment dispatch
		978	* per pixel). In either case, it's not meaningful to compute a sample
		979	* value. Just set it to 0.
		980	*/
		981	s_is_zero = true;
		982	}
		983	}
		984
		985	/**
		986	* Emit code to compensate for the difference between Y and W tiling.
		987	*
		988	* This code modifies the X and Y coordinates according to the formula:
		989	*
		990	* (X', Y', S') = detile(new_tiling, tile(old_tiling, X, Y, S))
		991	*
		992	* (See brw_blorp_blit_program).
		993	*
		994	* It can only translate between W and Y tiling, so new_tiling and old_tiling
		995	* are booleans where true represents W tiling and false represents Y tiling.
		996	*/
		997	void
		998	brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
		999	{
		1000	if (old_tiled_w == new_tiled_w)
		1001	return;
		1002
		1003	/* In the code that follows, we can safely assume that S = 0, because W
		1004	* tiling formats always use IMS layout.
		1005	*/
		1006	assert(s_is_zero);
		1007
		1008	if (new_tiled_w) {
		1009	/* Given X and Y coordinates that describe an address using Y tiling,
		1010	* translate to the X and Y coordinates that describe the same address
		1011	* using W tiling.
		1012	*
		1013	* If we break down the low order bits of X and Y, using a
		1014	* single letter to represent each low-order bit:
		1015	*
		1016	* X = A << 7 \| 0bBCDEFGH
		1017	* Y = J << 5 \| 0bKLMNP (1)
		1018	*
		1019	* Then we can apply the Y tiling formula to see the memory offset being
		1020	* addressed:
		1021	*
		1022	* offset = (J * tile_pitch + A) << 12 \| 0bBCDKLMNPEFGH (2)
		1023	*
		1024	* If we apply the W detiling formula to this memory location, that the
		1025	* corresponding X' and Y' coordinates are:
		1026	*
		1027	* X' = A << 6 \| 0bBCDPFH (3)
		1028	* Y' = J << 6 \| 0bKLMNEG
		1029	*
		1030	* Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
		1031	* we need to make the following computation:
		1032	*
		1033	* X' = (X & ~0b1011) >> 1 \| (Y & 0b1) << 2 \| X & 0b1 (4)
		1034	* Y' = (Y & ~0b1) << 1 \| (X & 0b1000) >> 2 \| (X & 0b10) >> 1
		1035	*/
		1036	emit_and(t1, X, brw_imm_uw(0xfff4)); /* X & ~0b1011 */
		1037	emit_shr(t1, t1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */
		1038	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1039	emit_shl(t2, t2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */
		1040	emit_or(t1, t1, t2); /* (X & ~0b1011) >> 1 \| (Y & 0b1) << 2 */
		1041	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1042	emit_or(Xp, t1, t2);
		1043	emit_and(t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
		1044	emit_shl(t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
		1045	emit_and(t2, X, brw_imm_uw(8)); /* X & 0b1000 */
		1046	emit_shr(t2, t2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */
		1047	emit_or(t1, t1, t2); /* (Y & ~0b1) << 1 \| (X & 0b1000) >> 2 */
		1048	emit_and(t2, X, brw_imm_uw(2)); /* X & 0b10 */
		1049	emit_shr(t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
		1050	emit_or(Yp, t1, t2);
		1051	SWAP_XY_AND_XPYP();
		1052	} else {
		1053	/* Applying the same logic as above, but in reverse, we obtain the
		1054	* formulas:
		1055	*
		1056	* X' = (X & ~0b101) << 1 \| (Y & 0b10) << 2 \| (Y & 0b1) << 1 \| X & 0b1
		1057	* Y' = (Y & ~0b11) >> 1 \| (X & 0b100) >> 2
		1058	*/
		1059	emit_and(t1, X, brw_imm_uw(0xfffa)); /* X & ~0b101 */
		1060	emit_shl(t1, t1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */
		1061	emit_and(t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
		1062	emit_shl(t2, t2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */
		1063	emit_or(t1, t1, t2); /* (X & ~0b101) << 1 \| (Y & 0b10) << 2 */
		1064	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1065	emit_shl(t2, t2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */
		1066	emit_or(t1, t1, t2); /* (X & ~0b101) << 1 \| (Y & 0b10) << 2
		1067	\| (Y & 0b1) << 1 */
		1068	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1069	emit_or(Xp, t1, t2);
		1070	emit_and(t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
		1071	emit_shr(t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
		1072	emit_and(t2, X, brw_imm_uw(4)); /* X & 0b100 */
		1073	emit_shr(t2, t2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */
		1074	emit_or(Yp, t1, t2);
		1075	SWAP_XY_AND_XPYP();
		1076	}
		1077	}
		1078
		1079	/**
		1080	* Emit code to compensate for the difference between MSAA and non-MSAA
		1081	* surfaces.
		1082	*
		1083	* This code modifies the X and Y coordinates according to the formula:
		1084	*
		1085	* (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
		1086	*
		1087	* (See brw_blorp_blit_program).
		1088	*/
		1089	void
		1090	brw_blorp_blit_program::encode_msaa(unsigned num_samples,
		1091	intel_msaa_layout layout)
		1092	{
		1093	switch (layout) {
		1094	case INTEL_MSAA_LAYOUT_NONE:
		1095	/* No translation necessary, and S should already be zero. */
		1096	assert(s_is_zero);
		1097	break;
		1098	case INTEL_MSAA_LAYOUT_CMS:
		1099	/* We can't compensate for compressed layout since at this point in the
		1100	* program we haven't read from the MCS buffer.
		1101	*/
		1102	unreachable("Bad layout in encode_msaa");
		1103	case INTEL_MSAA_LAYOUT_UMS:
		1104	/* No translation necessary. */
		1105	break;
		1106	case INTEL_MSAA_LAYOUT_IMS:
		1107	switch (num_samples) {
		1108	case 4:
		1109	/* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
		1110	* where X' = (X & ~0b1) << 1 \| (S & 0b1) << 1 \| (X & 0b1)
		1111	* Y' = (Y & ~0b1) << 1 \| (S & 0b10) \| (Y & 0b1)
		1112	*/
		1113	emit_and(t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
		1114	if (!s_is_zero) {
		1115	emit_and(t2, S, brw_imm_uw(1)); /* S & 0b1 */
		1116	emit_or(t1, t1, t2); /* (X & ~0b1) \| (S & 0b1) */
		1117	}
		1118	emit_shl(t1, t1, brw_imm_uw(1)); /* (X & ~0b1) << 1
		1119	\| (S & 0b1) << 1 */
		1120	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1121	emit_or(Xp, t1, t2);
		1122	emit_and(t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
		1123	emit_shl(t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
		1124	if (!s_is_zero) {
		1125	emit_and(t2, S, brw_imm_uw(2)); /* S & 0b10 */
		1126	emit_or(t1, t1, t2); /* (Y & ~0b1) << 1 \| (S & 0b10) */
		1127	}
		1128	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1129	emit_or(Yp, t1, t2);
		1130	break;
		1131	case 8:
		1132	/* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
		1133	* where X' = (X & ~0b1) << 2 \| (S & 0b100) \| (S & 0b1) << 1
		1134	* \| (X & 0b1)
		1135	* Y' = (Y & ~0b1) << 1 \| (S & 0b10) \| (Y & 0b1)
		1136	*/
		1137	emit_and(t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
		1138	emit_shl(t1, t1, brw_imm_uw(2)); /* (X & ~0b1) << 2 */
		1139	if (!s_is_zero) {
		1140	emit_and(t2, S, brw_imm_uw(4)); /* S & 0b100 */
		1141	emit_or(t1, t1, t2); /* (X & ~0b1) << 2 \| (S & 0b100) */
		1142	emit_and(t2, S, brw_imm_uw(1)); /* S & 0b1 */
		1143	emit_shl(t2, t2, brw_imm_uw(1)); /* (S & 0b1) << 1 */
		1144	emit_or(t1, t1, t2); /* (X & ~0b1) << 2 \| (S & 0b100)
		1145	\| (S & 0b1) << 1 */
		1146	}
		1147	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1148	emit_or(Xp, t1, t2);
		1149	emit_and(t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
		1150	emit_shl(t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
		1151	if (!s_is_zero) {
		1152	emit_and(t2, S, brw_imm_uw(2)); /* S & 0b10 */
		1153	emit_or(t1, t1, t2); /* (Y & ~0b1) << 1 \| (S & 0b10) */
		1154	}
		1155	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1156	emit_or(Yp, t1, t2);
		1157	break;
		1158	}
		1159	SWAP_XY_AND_XPYP();
		1160	s_is_zero = true;
		1161	break;
		1162	}
		1163	}
		1164
		1165	/**
		1166	* Emit code to compensate for the difference between MSAA and non-MSAA
		1167	* surfaces.
		1168	*
		1169	* This code modifies the X and Y coordinates according to the formula:
		1170	*
		1171	* (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
		1172	*
		1173	* (See brw_blorp_blit_program).
		1174	*/
		1175	void
		1176	brw_blorp_blit_program::decode_msaa(unsigned num_samples,
		1177	intel_msaa_layout layout)
		1178	{
		1179	switch (layout) {
		1180	case INTEL_MSAA_LAYOUT_NONE:
		1181	/* No translation necessary, and S should already be zero. */
		1182	assert(s_is_zero);
		1183	break;
		1184	case INTEL_MSAA_LAYOUT_CMS:
		1185	/* We can't compensate for compressed layout since at this point in the
		1186	* program we don't have access to the MCS buffer.
		1187	*/
		1188	unreachable("Bad layout in encode_msaa");
		1189	case INTEL_MSAA_LAYOUT_UMS:
		1190	/* No translation necessary. */
		1191	break;
		1192	case INTEL_MSAA_LAYOUT_IMS:
		1193	assert(s_is_zero);
		1194	switch (num_samples) {
		1195	case 4:
		1196	/* decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
		1197	* where X' = (X & ~0b11) >> 1 \| (X & 0b1)
		1198	* Y' = (Y & ~0b11) >> 1 \| (Y & 0b1)
		1199	* S = (Y & 0b10) \| (X & 0b10) >> 1
		1200	*/
		1201	emit_and(t1, X, brw_imm_uw(0xfffc)); /* X & ~0b11 */
		1202	emit_shr(t1, t1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */
		1203	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1204	emit_or(Xp, t1, t2);
		1205	emit_and(t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
		1206	emit_shr(t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
		1207	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1208	emit_or(Yp, t1, t2);
		1209	emit_and(t1, Y, brw_imm_uw(2)); /* Y & 0b10 */
		1210	emit_and(t2, X, brw_imm_uw(2)); /* X & 0b10 */
		1211	emit_shr(t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
		1212	emit_or(S, t1, t2);
		1213	break;
		1214	case 8:
		1215	/* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
		1216	* where X' = (X & ~0b111) >> 2 \| (X & 0b1)
		1217	* Y' = (Y & ~0b11) >> 1 \| (Y & 0b1)
		1218	* S = (X & 0b100) \| (Y & 0b10) \| (X & 0b10) >> 1
		1219	*/
		1220	emit_and(t1, X, brw_imm_uw(0xfff8)); /* X & ~0b111 */
		1221	emit_shr(t1, t1, brw_imm_uw(2)); /* (X & ~0b111) >> 2 */
		1222	emit_and(t2, X, brw_imm_uw(1)); /* X & 0b1 */
		1223	emit_or(Xp, t1, t2);
		1224	emit_and(t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
		1225	emit_shr(t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
		1226	emit_and(t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
		1227	emit_or(Yp, t1, t2);
		1228	emit_and(t1, X, brw_imm_uw(4)); /* X & 0b100 */
		1229	emit_and(t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
		1230	emit_or(t1, t1, t2); /* (X & 0b100) \| (Y & 0b10) */
		1231	emit_and(t2, X, brw_imm_uw(2)); /* X & 0b10 */
		1232	emit_shr(t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
		1233	emit_or(S, t1, t2);
		1234	break;
		1235	}
		1236	s_is_zero = false;
		1237	SWAP_XY_AND_XPYP();
		1238	break;
		1239	}
		1240	}
		1241
		1242	/**
		1243	* Emit code to translate from destination (X, Y) coordinates to source (X, Y)
		1244	* coordinates.
		1245	*/
		1246	void
		1247	brw_blorp_blit_program::translate_dst_to_src()
		1248	{
		1249	struct brw_reg X_f = retype(X, BRW_REGISTER_TYPE_F);
		1250	struct brw_reg Y_f = retype(Y, BRW_REGISTER_TYPE_F);
		1251	struct brw_reg Xp_f = retype(Xp, BRW_REGISTER_TYPE_F);
		1252	struct brw_reg Yp_f = retype(Yp, BRW_REGISTER_TYPE_F);
		1253
		1254	/* Move the UD coordinates to float registers. */
		1255	emit_mov(Xp_f, X);
		1256	emit_mov(Yp_f, Y);
		1257	/* Scale and offset */
		1258	emit_mad(X_f, x_transform.offset, Xp_f, x_transform.multiplier);
		1259	emit_mad(Y_f, y_transform.offset, Yp_f, y_transform.multiplier);
		1260	if (key->blit_scaled && key->blend) {
		1261	/* Translate coordinates to lay out the samples in a rectangular grid
		1262	* roughly corresponding to sample locations.
		1263	*/
		1264	emit_mul(X_f, X_f, brw_imm_f(key->x_scale));
		1265	emit_mul(Y_f, Y_f, brw_imm_f(key->y_scale));
		1266	/* Adjust coordinates so that integers represent pixel centers rather
		1267	* than pixel edges.
		1268	*/
		1269	emit_add(X_f, X_f, brw_imm_f(-0.5));
		1270	emit_add(Y_f, Y_f, brw_imm_f(-0.5));
		1271
		1272	/* Clamp the X, Y texture coordinates to properly handle the sampling of
		1273	* texels on texture edges.
		1274	*/
		1275	clamp_tex_coords(X_f, Y_f,
		1276	brw_imm_f(0.0), brw_imm_f(0.0),
		1277	rect_grid_x1, rect_grid_y1);
		1278
		1279	/* Store the fractional parts to be used as bilinear interpolation
		1280	* coefficients.
		1281	*/
		1282	emit_frc(x_frac, X_f);
		1283	emit_frc(y_frac, Y_f);
		1284
		1285	/* Round the float coordinates down to nearest integer */
		1286	emit_rndd(Xp_f, X_f);
		1287	emit_rndd(Yp_f, Y_f);
		1288	emit_mul(X_f, Xp_f, brw_imm_f(1 / key->x_scale));
		1289	emit_mul(Y_f, Yp_f, brw_imm_f(1 / key->y_scale));
		1290	SWAP_XY_AND_XPYP();
		1291	} else if (!key->bilinear_filter) {
		1292	/* Round the float coordinates down to nearest integer by moving to
		1293	* UD registers.
		1294	*/
		1295	emit_mov(Xp, X_f);
		1296	emit_mov(Yp, Y_f);
		1297	SWAP_XY_AND_XPYP();
		1298	}
		1299	}
		1300
		1301	void
		1302	brw_blorp_blit_program::clamp_tex_coords(struct brw_reg regX,
		1303	struct brw_reg regY,
		1304	struct brw_reg clampX0,
		1305	struct brw_reg clampY0,
		1306	struct brw_reg clampX1,
		1307	struct brw_reg clampY1)
		1308	{
		1309	emit_max(regX, regX, clampX0);
		1310	emit_max(regY, regY, clampY0);
		1311	emit_min(regX, regX, clampX1);
		1312	emit_min(regY, regY, clampY1);
		1313	}
		1314
		1315	/**
		1316	* Emit code to transform the X and Y coordinates as needed for blending
		1317	* together the different samples in an MSAA texture.
		1318	*/
		1319	void
		1320	brw_blorp_blit_program::single_to_blend()
		1321	{
		1322	/* When looking up samples in an MSAA texture using the SAMPLE message,
		1323	* Gen6 requires the texture coordinates to be odd integers (so that they
		1324	* correspond to the center of a 2x2 block representing the four samples
		1325	* that maxe up a pixel). So we need to multiply our X and Y coordinates
		1326	* each by 2 and then add 1.
		1327	*/
		1328	emit_shl(t1, X, brw_imm_w(1));
		1329	emit_shl(t2, Y, brw_imm_w(1));
		1330	emit_add(Xp, t1, brw_imm_w(1));
		1331	emit_add(Yp, t2, brw_imm_w(1));
		1332	SWAP_XY_AND_XPYP();
		1333	}
		1334
		1335
		1336	/**
		1337	* Count the number of trailing 1 bits in the given value. For example:
		1338	*
		1339	* count_trailing_one_bits(0) == 0
		1340	* count_trailing_one_bits(7) == 3
		1341	* count_trailing_one_bits(11) == 2
		1342	*/
		1343	inline int count_trailing_one_bits(unsigned value)
		1344	{
		1345	#ifdef HAVE___BUILTIN_CTZ
		1346	return __builtin_ctz(~value);
		1347	#else
		1348	return _mesa_bitcount(value & ~(value + 1));
		1349	#endif
		1350	}
		1351
		1352
		1353	void
		1354	brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
		1355	{
		1356	if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
		1357	mcs_fetch();
		1358
		1359	/* We add together samples using a binary tree structure, e.g. for 4x MSAA:
		1360	*
		1361	* result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
		1362	*
		1363	* This ensures that when all samples have the same value, no numerical
		1364	* precision is lost, since each addition operation always adds two equal
		1365	* values, and summing two equal floating point values does not lose
		1366	* precision.
		1367	*
		1368	* We perform this computation by treating the texture_data array as a
		1369	* stack and performing the following operations:
		1370	*
		1371	* - push sample 0 onto stack
		1372	* - push sample 1 onto stack
		1373	* - add top two stack entries
		1374	* - push sample 2 onto stack
		1375	* - push sample 3 onto stack
		1376	* - add top two stack entries
		1377	* - add top two stack entries
		1378	* - divide top stack entry by 4
		1379	*
		1380	* Note that after pushing sample i onto the stack, the number of add
		1381	* operations we do is equal to the number of trailing 1 bits in i. This
		1382	* works provided the total number of samples is a power of two, which it
		1383	* always is for i965.
		1384	*
		1385	* For integer formats, we replace the add operations with average
		1386	* operations and skip the final division.
		1387	*/
		1388	unsigned stack_depth = 0;
		1389	for (unsigned i = 0; i < num_samples; ++i) {
		1390	assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */
		1391
		1392	/* Push sample i onto the stack */
		1393	assert(stack_depth < ARRAY_SIZE(texture_data));
		1394	if (i == 0) {
		1395	s_is_zero = true;
		1396	} else {
		1397	s_is_zero = false;
		1398	emit_mov(vec16(S), brw_imm_ud(i));
		1399	}
		1400	texel_fetch(texture_data[stack_depth++]);
		1401
		1402	if (i == 0 && key->tex_layout == INTEL_MSAA_LAYOUT_CMS) {
		1403	/* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
		1404	* suggests an optimization:
		1405	*
		1406	* "A simple optimization with probable large return in
		1407	* performance is to compare the MCS value to zero (indicating
		1408	* all samples are on sample slice 0), and sample only from
		1409	* sample slice 0 using ld2dss if MCS is zero."
		1410	*
		1411	* Note that in the case where the MCS value is zero, sampling from
		1412	* sample slice 0 using ld2dss and sampling from sample 0 using
		1413	* ld2dms are equivalent (since all samples are on sample slice 0).
		1414	* Since we have already sampled from sample 0, all we need to do is
		1415	* skip the remaining fetches and averaging if MCS is zero.
		1416	*/
		1417	emit_cmp_if(BRW_CONDITIONAL_NZ, mcs_data, brw_imm_ud(0));
		1418	}
		1419
		1420	/* Do count_trailing_one_bits(i) times */
		1421	for (int j = count_trailing_one_bits(i); j-- > 0; ) {
		1422	assert(stack_depth >= 2);
		1423	--stack_depth;
		1424
		1425	/* TODO: should use a smaller loop bound for non_RGBA formats */
		1426	for (int k = 0; k < 4; ++k) {
		1427	emit_combine(key->texture_data_type == BRW_REGISTER_TYPE_F ?
		1428	BRW_OPCODE_ADD : BRW_OPCODE_AVG,
		1429	offset(texture_data[stack_depth - 1], 2*k),
		1430	offset(vec8(texture_data[stack_depth - 1]), 2*k),
		1431	offset(vec8(texture_data[stack_depth]), 2*k));
		1432	}
		1433	}
		1434	}
		1435
		1436	/* We should have just 1 sample on the stack now. */
		1437	assert(stack_depth == 1);
		1438
		1439	if (key->texture_data_type == BRW_REGISTER_TYPE_F) {
		1440	/* Scale the result down by a factor of num_samples */
		1441	/* TODO: should use a smaller loop bound for non-RGBA formats */
		1442	for (int j = 0; j < 4; ++j) {
		1443	emit_mul(offset(texture_data[0], 2*j),
		1444	offset(vec8(texture_data[0]), 2*j),
		1445	brw_imm_f(1.0/num_samples));
		1446	}
		1447	}
		1448
		1449	if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
		1450	emit_endif();
		1451	}
		1452
		1453	void
		1454	brw_blorp_blit_program::manual_blend_bilinear(unsigned num_samples)
		1455	{
		1456	/* We do this computation by performing the following operations:
		1457	*
		1458	* In case of 4x, 8x MSAA:
		1459	* - Compute the pixel coordinates and sample numbers (a, b, c, d)
		1460	* which are later used for interpolation
		1461	* - linearly interpolate samples a and b in X
		1462	* - linearly interpolate samples c and d in X
		1463	* - linearly interpolate the results of last two operations in Y
		1464	*
		1465	* result = lrp(lrp(a + b) + lrp(c + d))
		1466	*/
		1467	struct brw_reg Xp_f = retype(Xp, BRW_REGISTER_TYPE_F);
		1468	struct brw_reg Yp_f = retype(Yp, BRW_REGISTER_TYPE_F);
		1469	struct brw_reg t1_f = retype(t1, BRW_REGISTER_TYPE_F);
		1470	struct brw_reg t2_f = retype(t2, BRW_REGISTER_TYPE_F);
		1471
		1472	for (unsigned i = 0; i < 4; ++i) {
		1473	assert(i < ARRAY_SIZE(texture_data));
		1474	s_is_zero = false;
		1475
		1476	/* Compute pixel coordinates */
		1477	emit_add(vec16(x_sample_coords), Xp_f,
		1478	brw_imm_f((float)(i & 0x1) * (1.0 / key->x_scale)));
		1479	emit_add(vec16(y_sample_coords), Yp_f,
		1480	brw_imm_f((float)((i >> 1) & 0x1) * (1.0 / key->y_scale)));
		1481	emit_mov(vec16(X), x_sample_coords);
		1482	emit_mov(vec16(Y), y_sample_coords);
		1483
		1484	/* The MCS value we fetch has to match up with the pixel that we're
		1485	* sampling from. Since we sample from different pixels in each
		1486	* iteration of this "for" loop, the call to mcs_fetch() should be
		1487	* here inside the loop after computing the pixel coordinates.
		1488	*/
		1489	if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
		1490	mcs_fetch();
		1491
		1492	/* Compute sample index and map the sample index to a sample number.
		1493	* Sample index layout shows the numbering of slots in a rectangular
		1494	* grid of samples with in a pixel. Sample number layout shows the
		1495	* rectangular grid of samples roughly corresponding to the real sample
		1496	* locations with in a pixel.
		1497	* In case of 4x MSAA, layout of sample indices matches the layout of
		1498	* sample numbers:
		1499	* ---------
		1500	* \| 0 \| 1 \|
		1501	* ---------
		1502	* \| 2 \| 3 \|
		1503	* ---------
		1504	*
		1505	* In case of 8x MSAA the two layouts don't match.
		1506	* sample index layout : --------- sample number layout : ---------
		1507	* \| 0 \| 1 \| \| 5 \| 2 \|
		1508	* --------- ---------
		1509	* \| 2 \| 3 \| \| 4 \| 6 \|
		1510	* --------- ---------
		1511	* \| 4 \| 5 \| \| 0 \| 3 \|
		1512	* --------- ---------
		1513	* \| 6 \| 7 \| \| 7 \| 1 \|
		1514	* --------- ---------
		1515	*/
		1516	emit_frc(vec16(t1_f), x_sample_coords);
		1517	emit_frc(vec16(t2_f), y_sample_coords);
		1518	emit_mul(vec16(t1_f), t1_f, brw_imm_f(key->x_scale));
		1519	emit_mul(vec16(t2_f), t2_f, brw_imm_f(key->x_scale * key->y_scale));
		1520	emit_add(vec16(t1_f), t1_f, t2_f);
		1521	emit_mov(vec16(S), t1_f);
		1522
		1523	if (num_samples == 8) {
		1524	/* Map the sample index to a sample number */
		1525	emit_cmp_if(BRW_CONDITIONAL_L, S, brw_imm_d(4));
		1526	{
		1527	emit_mov(vec16(t2), brw_imm_d(5));
		1528	emit_if_eq_mov(S, 1, vec16(t2), 2);
		1529	emit_if_eq_mov(S, 2, vec16(t2), 4);
		1530	emit_if_eq_mov(S, 3, vec16(t2), 6);
		1531	}
		1532	emit_else();
		1533	{
		1534	emit_mov(vec16(t2), brw_imm_d(0));
		1535	emit_if_eq_mov(S, 5, vec16(t2), 3);
		1536	emit_if_eq_mov(S, 6, vec16(t2), 7);
		1537	emit_if_eq_mov(S, 7, vec16(t2), 1);
		1538	}
		1539	emit_endif();
		1540	emit_mov(vec16(S), t2);
		1541	}
		1542	texel_fetch(texture_data[i]);
		1543	}
		1544
		1545	#define SAMPLE(x, y) offset(texture_data[x], y)
		1546	for (int index = 3; index > 0; ) {
		1547	/* Since we're doing SIMD16, 4 color channels fits in to 8 registers.
		1548	* Counter value of 8 in 'for' loop below is used to interpolate all
		1549	* the color components.
		1550	*/
		1551	for (int k = 0; k < 8; k += 2)
		1552	emit_lrp(vec8(SAMPLE(index - 1, k)),
		1553	x_frac,
		1554	vec8(SAMPLE(index, k)),
		1555	vec8(SAMPLE(index - 1, k)));
		1556	index -= 2;
		1557	}
		1558	for (int k = 0; k < 8; k += 2)
		1559	emit_lrp(vec8(SAMPLE(0, k)),
		1560	y_frac,
		1561	vec8(SAMPLE(2, k)),
		1562	vec8(SAMPLE(0, k)));
		1563	#undef SAMPLE
		1564	}
		1565
		1566	/**
		1567	* Emit code to look up a value in the texture using the SAMPLE message (which
		1568	* does blending of MSAA surfaces).
		1569	*/
		1570	void
		1571	brw_blorp_blit_program::sample(struct brw_reg dst)
		1572	{
		1573	static const sampler_message_arg args[2] = {
		1574	SAMPLER_MESSAGE_ARG_U_FLOAT,
		1575	SAMPLER_MESSAGE_ARG_V_FLOAT
		1576	};
		1577
		1578	texture_lookup(dst, SHADER_OPCODE_TEX, args, ARRAY_SIZE(args));
		1579	}
		1580
		1581	/**
		1582	* Emit code to look up a value in the texture using the SAMPLE_LD message
		1583	* (which does a simple texel fetch).
		1584	*/
		1585	void
		1586	brw_blorp_blit_program::texel_fetch(struct brw_reg dst)
		1587	{
		1588	static const sampler_message_arg gen6_args[5] = {
		1589	SAMPLER_MESSAGE_ARG_U_INT,
		1590	SAMPLER_MESSAGE_ARG_V_INT,
		1591	SAMPLER_MESSAGE_ARG_ZERO_INT, /* R */
		1592	SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */
		1593	SAMPLER_MESSAGE_ARG_SI_INT
		1594	};
		1595	static const sampler_message_arg gen7_ld_args[3] = {
		1596	SAMPLER_MESSAGE_ARG_U_INT,
		1597	SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */
		1598	SAMPLER_MESSAGE_ARG_V_INT
		1599	};
		1600	static const sampler_message_arg gen7_ld2dss_args[3] = {
		1601	SAMPLER_MESSAGE_ARG_SI_INT,
		1602	SAMPLER_MESSAGE_ARG_U_INT,
		1603	SAMPLER_MESSAGE_ARG_V_INT
		1604	};
		1605	static const sampler_message_arg gen7_ld2dms_args[4] = {
		1606	SAMPLER_MESSAGE_ARG_SI_INT,
		1607	SAMPLER_MESSAGE_ARG_MCS_INT,
		1608	SAMPLER_MESSAGE_ARG_U_INT,
		1609	SAMPLER_MESSAGE_ARG_V_INT
		1610	};
		1611
		1612	switch (brw->gen) {
		1613	case 6:
		1614	texture_lookup(dst, SHADER_OPCODE_TXF, gen6_args, s_is_zero ? 2 : 5);
		1615	break;
		1616	case 7:
		1617	switch (key->tex_layout) {
		1618	case INTEL_MSAA_LAYOUT_IMS:
		1619	/* From the Ivy Bridge PRM, Vol4 Part1 p72 (Multisampled Surface Storage
		1620	* Format):
		1621	*
		1622	* If this field is MSFMT_DEPTH_STENCIL
		1623	* [a.k.a. INTEL_MSAA_LAYOUT_IMS], the only sampling engine
		1624	* messages allowed are "ld2dms", "resinfo", and "sampleinfo".
		1625	*
		1626	* So fall through to emit the same message as we use for
		1627	* INTEL_MSAA_LAYOUT_CMS.
		1628	*/
		1629	case INTEL_MSAA_LAYOUT_CMS:
		1630	texture_lookup(dst, SHADER_OPCODE_TXF_CMS,
		1631	gen7_ld2dms_args, ARRAY_SIZE(gen7_ld2dms_args));
		1632	break;
		1633	case INTEL_MSAA_LAYOUT_UMS:
		1634	texture_lookup(dst, SHADER_OPCODE_TXF_UMS,
		1635	gen7_ld2dss_args, ARRAY_SIZE(gen7_ld2dss_args));
		1636	break;
		1637	case INTEL_MSAA_LAYOUT_NONE:
		1638	assert(s_is_zero);
		1639	texture_lookup(dst, SHADER_OPCODE_TXF, gen7_ld_args,
		1640	ARRAY_SIZE(gen7_ld_args));
		1641	break;
		1642	}
		1643	break;
		1644	default:
		1645	unreachable("Should not get here.");
		1646	};
		1647	}
		1648
		1649	void
		1650	brw_blorp_blit_program::mcs_fetch()
		1651	{
		1652	static const sampler_message_arg gen7_ld_mcs_args[2] = {
		1653	SAMPLER_MESSAGE_ARG_U_INT,
		1654	SAMPLER_MESSAGE_ARG_V_INT
		1655	};
		1656	texture_lookup(vec16(mcs_data), SHADER_OPCODE_TXF_MCS,
		1657	gen7_ld_mcs_args, ARRAY_SIZE(gen7_ld_mcs_args));
		1658	}
		1659
		1660	void
		1661	brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
		1662	enum opcode op,
		1663	const sampler_message_arg *args,
		1664	int num_args)
		1665	{
		1666	struct brw_reg mrf =
		1667	retype(vec16(brw_message_reg(base_mrf)), BRW_REGISTER_TYPE_UD);
		1668	for (int arg = 0; arg < num_args; ++arg) {
		1669	switch (args[arg]) {
		1670	case SAMPLER_MESSAGE_ARG_U_FLOAT:
		1671	if (key->bilinear_filter)
		1672	emit_mov(retype(mrf, BRW_REGISTER_TYPE_F),
		1673	retype(X, BRW_REGISTER_TYPE_F));
		1674	else
		1675	emit_mov(retype(mrf, BRW_REGISTER_TYPE_F), X);
		1676	break;
		1677	case SAMPLER_MESSAGE_ARG_V_FLOAT:
		1678	if (key->bilinear_filter)
		1679	emit_mov(retype(mrf, BRW_REGISTER_TYPE_F),
		1680	retype(Y, BRW_REGISTER_TYPE_F));
		1681	else
		1682	emit_mov(retype(mrf, BRW_REGISTER_TYPE_F), Y);
		1683	break;
		1684	case SAMPLER_MESSAGE_ARG_U_INT:
		1685	emit_mov(mrf, X);
		1686	break;
		1687	case SAMPLER_MESSAGE_ARG_V_INT:
		1688	emit_mov(mrf, Y);
		1689	break;
		1690	case SAMPLER_MESSAGE_ARG_SI_INT:
		1691	/* Note: on Gen7, this code may be reached with s_is_zero==true
		1692	* because in Gen7's ld2dss message, the sample index is the first
		1693	* argument. When this happens, we need to move a 0 into the
		1694	* appropriate message register.
		1695	*/
		1696	if (s_is_zero)
		1697	emit_mov(mrf, brw_imm_ud(0));
		1698	else
		1699	emit_mov(mrf, S);
		1700	break;
		1701	case SAMPLER_MESSAGE_ARG_MCS_INT:
		1702	switch (key->tex_layout) {
		1703	case INTEL_MSAA_LAYOUT_CMS:
		1704	emit_mov(mrf, mcs_data);
		1705	break;
		1706	case INTEL_MSAA_LAYOUT_IMS:
		1707	/* When sampling from an IMS surface, MCS data is not relevant,
		1708	* and the hardware ignores it. So don't bother populating it.
		1709	*/
		1710	break;
		1711	default:
		1712	/* We shouldn't be trying to send MCS data with any other
		1713	* layouts.
		1714	*/
		1715	assert (!"Unsupported layout for MCS data");
		1716	break;
		1717	}
		1718	break;
		1719	case SAMPLER_MESSAGE_ARG_ZERO_INT:
		1720	emit_mov(mrf, brw_imm_ud(0));
		1721	break;
		1722	}
		1723	mrf.nr += 2;
		1724	}
		1725
		1726	emit_texture_lookup(retype(dst, BRW_REGISTER_TYPE_UW) /* dest */,
		1727	op,
		1728	base_mrf,
		1729	mrf.nr - base_mrf /* msg_length */);
		1730	}
		1731
		1732	#undef X
		1733	#undef Y
		1734	#undef U
		1735	#undef V
		1736	#undef S
		1737	#undef SWAP_XY_AND_XPYP
		1738
		1739	void
		1740	brw_blorp_blit_program::render_target_write()
		1741	{
		1742	struct brw_reg mrf_rt_write =
		1743	retype(vec16(brw_message_reg(base_mrf)), key->texture_data_type);
		1744	int mrf_offset = 0;
		1745
		1746	/* If we may have killed pixels, then we need to send R0 and R1 in a header
		1747	* so that the render target knows which pixels we killed.
		1748	*/
		1749	bool use_header = key->use_kill;
		1750	if (use_header) {
		1751	/* Copy R0/1 to MRF */
		1752	emit_mov(retype(mrf_rt_write, BRW_REGISTER_TYPE_UD),
		1753	retype(R0, BRW_REGISTER_TYPE_UD));
		1754	mrf_offset += 2;
		1755	}
		1756
		1757	/* Copy texture data to MRFs */
		1758	for (int i = 0; i < 4; ++i) {
		1759	/* E.g. mov(16) m2.0<1>:f r2.0<8;8,1>:f { Align1, H1 } */
		1760	emit_mov(offset(mrf_rt_write, mrf_offset),
		1761	offset(vec8(texture_data[0]), 2*i));
		1762	mrf_offset += 2;
		1763	}
		1764
		1765	/* Now write to the render target and terminate the thread */
		1766	emit_render_target_write(
		1767	mrf_rt_write,
		1768	base_mrf,
		1769	mrf_offset /* msg_length. TODO: Should be smaller for non-RGBA formats. */,
		1770	use_header);
		1771	}
		1772
		1773
		1774	void
		1775	brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
		1776	GLfloat dst0, GLfloat dst1,
		1777	bool mirror)
		1778	{
		1779	float scale = (src1 - src0) / (dst1 - dst0);
		1780	if (!mirror) {
		1781	/* When not mirroring a coordinate (say, X), we need:
		1782	* src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
		1783	* Therefore:
		1784	* src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
		1785	*
		1786	* blorp program uses "round toward zero" to convert the
		1787	* transformed floating point coordinates to integer coordinates,
		1788	* whereas the behaviour we actually want is "round to nearest",
		1789	* so 0.5 provides the necessary correction.
		1790	*/
		1791	multiplier = scale;
		1792	offset = src0 + (-dst0 + 0.5) * scale;
		1793	} else {
		1794	/* When mirroring X we need:
		1795	* src_x - src_x0 = dst_x1 - dst_x - 0.5
		1796	* Therefore:
		1797	* src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
		1798	*/
		1799	multiplier = -scale;
		1800	offset = src0 + (dst1 - 0.5) * scale;
		1801	}
		1802	}
		1803
		1804
		1805	/**
		1806	* Determine which MSAA layout the GPU pipeline should be configured for,
		1807	* based on the chip generation, the number of samples, and the true layout of
		1808	* the image in memory.
		1809	*/
		1810	inline intel_msaa_layout
		1811	compute_msaa_layout_for_pipeline(struct brw_context *brw, unsigned num_samples,
		1812	intel_msaa_layout true_layout)
		1813	{
		1814	if (num_samples <= 1) {
		1815	/* When configuring the GPU for non-MSAA, we can still accommodate IMS
		1816	* format buffers, by transforming coordinates appropriately.
		1817	*/
		1818	assert(true_layout == INTEL_MSAA_LAYOUT_NONE \|\|
		1819	true_layout == INTEL_MSAA_LAYOUT_IMS);
		1820	return INTEL_MSAA_LAYOUT_NONE;
		1821	} else {
		1822	assert(true_layout != INTEL_MSAA_LAYOUT_NONE);
		1823	}
		1824
		1825	/* Prior to Gen7, all MSAA surfaces use IMS layout. */
		1826	if (brw->gen == 6) {
		1827	assert(true_layout == INTEL_MSAA_LAYOUT_IMS);
		1828	}
		1829
		1830	return true_layout;
		1831	}
		1832
		1833
		1834	brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
		1835	struct intel_mipmap_tree *src_mt,
		1836	unsigned src_level, unsigned src_layer,
		1837	mesa_format src_format,
		1838	struct intel_mipmap_tree *dst_mt,
		1839	unsigned dst_level, unsigned dst_layer,
		1840	mesa_format dst_format,
		1841	GLfloat src_x0, GLfloat src_y0,
		1842	GLfloat src_x1, GLfloat src_y1,
		1843	GLfloat dst_x0, GLfloat dst_y0,
		1844	GLfloat dst_x1, GLfloat dst_y1,
		1845	GLenum filter,
		1846	bool mirror_x, bool mirror_y)
		1847	{
		1848	src.set(brw, src_mt, src_level, src_layer, src_format, false);
		1849	dst.set(brw, dst_mt, dst_level, dst_layer, dst_format, true);
		1850
		1851	/* Even though we do multisample resolves at the time of the blit, OpenGL
		1852	* specification defines them as if they happen at the time of rendering,
		1853	* which means that the type of averaging we do during the resolve should
		1854	* only depend on the source format; the destination format should be
		1855	* ignored. But, specification doesn't seem to be strict about it.
		1856	*
		1857	* It has been observed that mulitisample resolves produce slightly better
		1858	* looking images when averaging is done using destination format. NVIDIA's
		1859	* proprietary OpenGL driver also follow this approach. So, we choose to
		1860	* follow it in our driver.
		1861	*
		1862	* When multisampling, if the source and destination formats are equal
		1863	* (aside from the color space), we choose to blit in sRGB space to get
		1864	* this higher quality image.
		1865	*/
		1866	if (src.num_samples > 1 &&
		1867	_mesa_get_format_color_encoding(dst_mt->format) == GL_SRGB &&
		1868	_mesa_get_srgb_format_linear(src_mt->format) ==
		1869	_mesa_get_srgb_format_linear(dst_mt->format)) {
		1870	assert(brw->format_supported_as_render_target[dst_mt->format]);
		1871	dst.brw_surfaceformat = brw->render_target_format[dst_mt->format];
		1872	src.brw_surfaceformat = brw_format_for_mesa_format(dst_mt->format);
		1873	}
		1874
		1875	/* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
		1876	* texture, the above code configures the source format for L32_FLOAT or
		1877	* I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
		1878	* the SAMPLE message appears to handle multisampled L32_FLOAT and
		1879	* I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
		1880	* around the problem by using a source format of R32_FLOAT. This
		1881	* shouldn't affect rendering correctness, since the destination format is
		1882	* R32_FLOAT, so only the contents of the red channel matters.
		1883	*/
		1884	if (brw->gen == 6 && src.num_samples > 1 && dst.num_samples <= 1 &&
		1885	src_mt->format == dst_mt->format &&
		1886	dst.brw_surfaceformat == BRW_SURFACEFORMAT_R32_FLOAT) {
		1887	src.brw_surfaceformat = dst.brw_surfaceformat;
		1888	}
		1889
		1890	use_wm_prog = true;
		1891	memset(&wm_prog_key, 0, sizeof(wm_prog_key));
		1892
		1893	/* texture_data_type indicates the register type that should be used to
		1894	* manipulate texture data.
		1895	*/
		1896	switch (_mesa_get_format_datatype(src_mt->format)) {
		1897	case GL_UNSIGNED_NORMALIZED:
		1898	case GL_SIGNED_NORMALIZED:
		1899	case GL_FLOAT:
		1900	wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
		1901	break;
		1902	case GL_UNSIGNED_INT:
		1903	if (src_mt->format == MESA_FORMAT_S_UINT8) {
		1904	/* We process stencil as though it's an unsigned normalized color */
		1905	wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
		1906	} else {
		1907	wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD;
		1908	}
		1909	break;
		1910	case GL_INT:
		1911	wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D;
		1912	break;
		1913	default:
		1914	unreachable("Unrecognized blorp format");
		1915	}
		1916
		1917	if (brw->gen > 6) {
		1918	/* Gen7's rendering hardware only supports the IMS layout for depth and
		1919	* stencil render targets. Blorp always maps its destination surface as
		1920	* a color render target (even if it's actually a depth or stencil
		1921	* buffer). So if the destination is IMS, we'll have to map it as a
		1922	* single-sampled texture and interleave the samples ourselves.
		1923	*/
		1924	if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS)
		1925	dst.num_samples = 0;
		1926	}
		1927
		1928	if (dst.map_stencil_as_y_tiled && dst.num_samples > 1) {
		1929	/* If the destination surface is a W-tiled multisampled stencil buffer
		1930	* that we're mapping as Y tiled, then we need to arrange for the WM
		1931	* program to run once per sample rather than once per pixel, because
		1932	* the memory layout of related samples doesn't match between W and Y
		1933	* tiling.
		1934	*/
		1935	wm_prog_key.persample_msaa_dispatch = true;
		1936	}
		1937
		1938	if (src.num_samples > 0 && dst.num_samples > 1) {
		1939	/* We are blitting from a multisample buffer to a multisample buffer, so
		1940	* we must preserve samples within a pixel. This means we have to
		1941	* arrange for the WM program to run once per sample rather than once
		1942	* per pixel.
		1943	*/
		1944	wm_prog_key.persample_msaa_dispatch = true;
		1945	}
		1946
		1947	/* Scaled blitting or not. */
		1948	wm_prog_key.blit_scaled =
		1949	((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
		1950	(dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
		1951
		1952	/* Scaling factors used for bilinear filtering in multisample scaled
		1953	* blits.
		1954	*/
		1955	wm_prog_key.x_scale = 2.0;
		1956	wm_prog_key.y_scale = src_mt->num_samples / 2.0;
		1957
		1958	if (filter == GL_LINEAR && src.num_samples <= 1 && dst.num_samples <= 1)
		1959	wm_prog_key.bilinear_filter = true;
		1960
		1961	GLenum base_format = _mesa_get_format_base_format(src_mt->format);
		1962	if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
		1963	base_format != GL_STENCIL_INDEX &&
		1964	src_mt->num_samples > 1 && dst_mt->num_samples <= 1) {
		1965	/* We are downsampling a color buffer, so blend. */
		1966	wm_prog_key.blend = true;
		1967	}
		1968
		1969	/* src_samples and dst_samples are the true sample counts */
		1970	wm_prog_key.src_samples = src_mt->num_samples;
		1971	wm_prog_key.dst_samples = dst_mt->num_samples;
		1972
		1973	/* tex_samples and rt_samples are the sample counts that are set up in
		1974	* SURFACE_STATE.
		1975	*/
		1976	wm_prog_key.tex_samples = src.num_samples;
		1977	wm_prog_key.rt_samples = dst.num_samples;
		1978
		1979	/* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
		1980	* use to access the source and destination surfaces.
		1981	*/
		1982	wm_prog_key.tex_layout =
		1983	compute_msaa_layout_for_pipeline(brw, src.num_samples, src.msaa_layout);
		1984	wm_prog_key.rt_layout =
		1985	compute_msaa_layout_for_pipeline(brw, dst.num_samples, dst.msaa_layout);
		1986
		1987	/* src_layout and dst_layout indicate the true MSAA layout used by src and
		1988	* dst.
		1989	*/
		1990	wm_prog_key.src_layout = src_mt->msaa_layout;
		1991	wm_prog_key.dst_layout = dst_mt->msaa_layout;
		1992
		1993	wm_prog_key.src_tiled_w = src.map_stencil_as_y_tiled;
		1994	wm_prog_key.dst_tiled_w = dst.map_stencil_as_y_tiled;
		1995	/* Round floating point values to nearest integer to avoid "off by one texel"
		1996	* kind of errors when blitting.
		1997	*/
		1998	x0 = wm_push_consts.dst_x0 = roundf(dst_x0);
		1999	y0 = wm_push_consts.dst_y0 = roundf(dst_y0);
		2000	x1 = wm_push_consts.dst_x1 = roundf(dst_x1);
		2001	y1 = wm_push_consts.dst_y1 = roundf(dst_y1);
		2002	wm_push_consts.rect_grid_x1 = (minify(src_mt->logical_width0, src_level) *
		2003	wm_prog_key.x_scale - 1.0);
		2004	wm_push_consts.rect_grid_y1 = (minify(src_mt->logical_height0, src_level) *
		2005	wm_prog_key.y_scale - 1.0);
		2006
		2007	wm_push_consts.x_transform.setup(src_x0, src_x1, dst_x0, dst_x1, mirror_x);
		2008	wm_push_consts.y_transform.setup(src_y0, src_y1, dst_y0, dst_y1, mirror_y);
		2009
		2010	if (dst.num_samples <= 1 && dst_mt->num_samples > 1) {
		2011	/* We must expand the rectangle we send through the rendering pipeline,
		2012	* to account for the fact that we are mapping the destination region as
		2013	* single-sampled when it is in fact multisampled. We must also align
		2014	* it to a multiple of the multisampling pattern, because the
		2015	* differences between multisampled and single-sampled surface formats
		2016	* will mean that pixels are scrambled within the multisampling pattern.
		2017	* TODO: what if this makes the coordinates too large?
		2018	*
		2019	* Note: this only works if the destination surface uses the IMS layout.
		2020	* If it's UMS, then we have no choice but to set up the rendering
		2021	* pipeline as multisampled.
		2022	*/
		2023	assert(dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS);
		2024	switch (dst_mt->num_samples) {
		2025	case 4:
		2026	x0 = ROUND_DOWN_TO(x0 * 2, 4);
		2027	y0 = ROUND_DOWN_TO(y0 * 2, 4);
		2028	x1 = ALIGN(x1 * 2, 4);
		2029	y1 = ALIGN(y1 * 2, 4);
		2030	break;
		2031	case 8:
		2032	x0 = ROUND_DOWN_TO(x0 * 4, 8);
		2033	y0 = ROUND_DOWN_TO(y0 * 2, 4);
		2034	x1 = ALIGN(x1 * 4, 8);
		2035	y1 = ALIGN(y1 * 2, 4);
		2036	break;
		2037	default:
		2038	unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
		2039	}
		2040	wm_prog_key.use_kill = true;
		2041	}
		2042
		2043	if (dst.map_stencil_as_y_tiled) {
		2044	/* We must modify the rectangle we send through the rendering pipeline
		2045	* (and the size and x/y offset of the destination surface), to account
		2046	* for the fact that we are mapping it as Y-tiled when it is in fact
		2047	* W-tiled.
		2048	*
		2049	* Both Y tiling and W tiling can be understood as organizations of
		2050	* 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
		2051	* is different, but the layout of the 32-byte sub-tiles within the 4k
		2052	* tile is the same (8 sub-tiles across by 16 sub-tiles down, in
		2053	* column-major order). In Y tiling, the sub-tiles are 16 bytes wide
		2054	* and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
		2055	*
		2056	* Therefore, to account for the layout differences within the 32-byte
		2057	* sub-tiles, we must expand the rectangle so the X coordinates of its
		2058	* edges are multiples of 8 (the W sub-tile width), and its Y
		2059	* coordinates of its edges are multiples of 4 (the W sub-tile height).
		2060	* Then we need to scale the X and Y coordinates of the rectangle to
		2061	* account for the differences in aspect ratio between the Y and W
		2062	* sub-tiles. We need to modify the layer width and height similarly.
		2063	*
		2064	* A correction needs to be applied when MSAA is in use: since
		2065	* INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
		2066	* we need to align the Y coordinates to multiples of 8, so that when
		2067	* they are divided by two they are still multiples of 4.
		2068	*
		2069	* Note: Since the x/y offset of the surface will be applied using the
		2070	* SURFACE_STATE command packet, it will be invisible to the swizzling
		2071	* code in the shader; therefore it needs to be in a multiple of the
		2072	* 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
		2073	* pixels wide and 4 pixels high (when viewed as a W-tiled stencil
		2074	* buffer), and the miplevel alignment used for stencil buffers is 8
		2075	* pixels horizontally and either 4 or 8 pixels vertically (see
		2076	* intel_horizontal_texture_alignment_unit() and
		2077	* intel_vertical_texture_alignment_unit()).
		2078	*
		2079	* Note: Also, since the SURFACE_STATE command packet can only apply
		2080	* offsets that are multiples of 4 pixels horizontally and 2 pixels
		2081	* vertically, it is important that the offsets will be multiples of
		2082	* these sizes after they are converted into Y-tiled coordinates.
		2083	* Fortunately they will be, since we know from above that the offsets
		2084	* are a multiple of the 32-byte sub-tile size, and in Y-tiled
		2085	* coordinates the sub-tile is 16 pixels wide and 2 pixels high.
		2086	*
		2087	* TODO: what if this makes the coordinates (or the texture size) too
		2088	* large?
		2089	*/
		2090	const unsigned x_align = 8, y_align = dst.num_samples != 0 ? 8 : 4;
		2091	x0 = ROUND_DOWN_TO(x0, x_align) * 2;
		2092	y0 = ROUND_DOWN_TO(y0, y_align) / 2;
		2093	x1 = ALIGN(x1, x_align) * 2;
		2094	y1 = ALIGN(y1, y_align) / 2;
		2095	dst.width = ALIGN(dst.width, x_align) * 2;
		2096	dst.height = ALIGN(dst.height, y_align) / 2;
		2097	dst.x_offset *= 2;
		2098	dst.y_offset /= 2;
		2099	wm_prog_key.use_kill = true;
		2100	}
		2101
		2102	if (src.map_stencil_as_y_tiled) {
		2103	/* We must modify the size and x/y offset of the source surface to
		2104	* account for the fact that we are mapping it as Y-tiled when it is in
		2105	* fact W tiled.
		2106	*
		2107	* See the comments above concerning x/y offset alignment for the
		2108	* destination surface.
		2109	*
		2110	* TODO: what if this makes the texture size too large?
		2111	*/
		2112	const unsigned x_align = 8, y_align = src.num_samples != 0 ? 8 : 4;
		2113	src.width = ALIGN(src.width, x_align) * 2;
		2114	src.height = ALIGN(src.height, y_align) / 2;
		2115	src.x_offset *= 2;
		2116	src.y_offset /= 2;
		2117	}
		2118	}
		2119
		2120	uint32_t
		2121	brw_blorp_blit_params::get_wm_prog(struct brw_context *brw,
		2122	brw_blorp_prog_data **prog_data) const
		2123	{
		2124	uint32_t prog_offset = 0;
		2125	if (!brw_search_cache(&brw->cache, BRW_CACHE_BLORP_BLIT_PROG,
		2126	&this->wm_prog_key, sizeof(this->wm_prog_key),
		2127	&prog_offset, prog_data)) {
		2128	brw_blorp_blit_program prog(brw, &this->wm_prog_key,
		2129	INTEL_DEBUG & DEBUG_BLORP);
		2130	GLuint program_size;
		2131	const GLuint *program = prog.compile(brw, &program_size);
		2132	brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_BLIT_PROG,
		2133	&this->wm_prog_key, sizeof(this->wm_prog_key),
		2134	program, program_size,
		2135	&prog.prog_data, sizeof(prog.prog_data),
		2136	&prog_offset, prog_data);
		2137	}
		2138	return prog_offset;
		2139	}

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/Mesa/mesa-10.6.0/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp – Rev 5564