/*
 * Copyright 2010 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *     Alex Deucher
 */

#include "drmP.h"
#include "drm.h"
#include "radeon_drm.h"
#include "radeon.h"

#include "evergreend.h"
#include "evergreen_blit_shaders.h"
#include "cayman_blit_shaders.h"

#define DI_PT_RECTLIST        0x11
#define DI_INDEX_SIZE_16_BIT  0x0
#define DI_SRC_SEL_AUTO_INDEX 0x2

#define FMT_8                 0x1
#define FMT_5_6_5             0x8
#define FMT_8_8_8_8           0x1a
#define COLOR_8               0x1
#define COLOR_5_6_5           0x8
#define COLOR_8_8_8_8         0x1a

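/* Emits the CB_COLOR0_* render target state for the blit destination:
 * base address, pitch/slice, format and dimensions.
 */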
/* emits 17 */
static void
set_render_target(struct radeon_device *rdev, int format,
		  int w, int h, u64 gpu_addr)
{
	u32 cb_color_info;
	int pitch, slice;

	h = ALIGN(h, 8);
	if (h < 8)
		h = 8;

	cb_color_info = ((format << 2) | (1 << 24) | (1 << 8));
	pitch = (w / 8) - 1;
	slice = ((w * h) / 64) - 1;

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 15));
	radeon_ring_write(rdev, (CB_COLOR0_BASE - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, gpu_addr >> 8);
	radeon_ring_write(rdev, pitch);
	radeon_ring_write(rdev, slice);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, cb_color_info);
	radeon_ring_write(rdev, (1 << 4));
	radeon_ring_write(rdev, (w - 1) | ((h - 1) << 16));
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
}

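/* Emits a SURFACE_SYNC packet: sync_type selects the cache action bits,
 * and size/mc_addr describe the affected range in 256-byte granularity.
 */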
/* emits 5dw */
static void
cp_set_surface_sync(struct radeon_device *rdev,
		    u32 sync_type, u32 size,
		    u64 mc_addr)
{
	u32 cp_coher_size;

	if (size == 0xffffffff)
		cp_coher_size = 0xffffffff;
	else
		cp_coher_size = ((size + 255) >> 8);

	radeon_ring_write(rdev, PACKET3(PACKET3_SURFACE_SYNC, 3));
	radeon_ring_write(rdev, sync_type);
	radeon_ring_write(rdev, cp_coher_size);
	radeon_ring_write(rdev, mc_addr >> 8);
	radeon_ring_write(rdev, 10); /* poll interval */
}

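/* Points SQ_PGM_START_VS and SQ_PGM_START_PS at the blit shaders copied
 * into the shader bo by evergreen_blit_init(), then issues an SH_ACTION
 * surface sync.
 */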
/* emits 11dw + 1 surface sync = 16dw */
static void
set_shaders(struct radeon_device *rdev)
{
	u64 gpu_addr;

	/* VS */
	gpu_addr = rdev->r600_blit.shader_gpu_addr + rdev->r600_blit.vs_offset;
	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 3));
	radeon_ring_write(rdev, (SQ_PGM_START_VS - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, gpu_addr >> 8);
	radeon_ring_write(rdev, 2);
	radeon_ring_write(rdev, 0);

	/* PS */
	gpu_addr = rdev->r600_blit.shader_gpu_addr + rdev->r600_blit.ps_offset;
	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 4));
	radeon_ring_write(rdev, (SQ_PGM_START_PS - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, gpu_addr >> 8);
	radeon_ring_write(rdev, 1);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 2);

	gpu_addr = rdev->r600_blit.shader_gpu_addr + rdev->r600_blit.vs_offset;
	cp_set_surface_sync(rdev, PACKET3_SH_ACTION_ENA, 512, gpu_addr);
}

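/* Emits the SET_RESOURCE fetch constant describing the 48-byte blit vertex
 * buffer, followed by a texture-cache or vertex-cache sync depending on the
 * family.
 */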
/* emits 10 + 1 sync (5) = 15 */
static void
set_vtx_resource(struct radeon_device *rdev, u64 gpu_addr)
{
	u32 sq_vtx_constant_word2, sq_vtx_constant_word3;

	/* high addr, stride */
	sq_vtx_constant_word2 = ((upper_32_bits(gpu_addr) & 0xff) | (16 << 8));
#ifdef __BIG_ENDIAN
	sq_vtx_constant_word2 |= (2 << 30);
#endif
	/* xyzw swizzles */
	sq_vtx_constant_word3 = (0 << 3) | (1 << 6) | (2 << 9) | (3 << 12);

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_RESOURCE, 8));
	radeon_ring_write(rdev, 0x580);
	radeon_ring_write(rdev, gpu_addr & 0xffffffff);
	radeon_ring_write(rdev, 48 - 1); /* size */
	radeon_ring_write(rdev, sq_vtx_constant_word2);
	radeon_ring_write(rdev, sq_vtx_constant_word3);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, SQ_TEX_VTX_VALID_BUFFER << 30);

	if ((rdev->family == CHIP_CEDAR) ||
	    (rdev->family == CHIP_PALM) ||
	    (rdev->family == CHIP_SUMO) ||
	    (rdev->family == CHIP_SUMO2) ||
	    (rdev->family == CHIP_CAICOS))
		cp_set_surface_sync(rdev,
				    PACKET3_TC_ACTION_ENA, 48, gpu_addr);
	else
		cp_set_surface_sync(rdev,
				    PACKET3_VC_ACTION_ENA, 48, gpu_addr);

}

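/* Emits the SET_RESOURCE texture descriptor for the blit source: a 2D
 * surface of the given format, width, height and pitch.
 */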
/* emits 10 */
static void
set_tex_resource(struct radeon_device *rdev,
		 int format, int w, int h, int pitch,
		 u64 gpu_addr)
{
	u32 sq_tex_resource_word0, sq_tex_resource_word1;
	u32 sq_tex_resource_word4, sq_tex_resource_word7;

	if (h < 1)
		h = 1;

	sq_tex_resource_word0 = (1 << 0); /* 2D */
	sq_tex_resource_word0 |= ((((pitch >> 3) - 1) << 6) |
				  ((w - 1) << 18));
	sq_tex_resource_word1 = ((h - 1) << 0) | (1 << 28);
	/* xyzw swizzles */
	sq_tex_resource_word4 = (0 << 16) | (1 << 19) | (2 << 22) | (3 << 25);

	sq_tex_resource_word7 = format | (SQ_TEX_VTX_VALID_TEXTURE << 30);

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_RESOURCE, 8));
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, sq_tex_resource_word0);
	radeon_ring_write(rdev, sq_tex_resource_word1);
	radeon_ring_write(rdev, gpu_addr >> 8);
	radeon_ring_write(rdev, gpu_addr >> 8);
	radeon_ring_write(rdev, sq_tex_resource_word4);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, 0);
	radeon_ring_write(rdev, sq_tex_resource_word7);
}

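/* Programs the screen, generic and window scissor rectangles that bound
 * the blit, with small-size workarounds for some chips.
 */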
/* emits 12 */
static void
set_scissors(struct radeon_device *rdev, int x1, int y1,
	     int x2, int y2)
{
	/* workaround some hw bugs */
	if (x2 == 0)
		x1 = 1;
	if (y2 == 0)
		y1 = 1;
	if (rdev->family == CHIP_CAYMAN) {
		if ((x2 == 1) && (y2 == 1))
			x2 = 2;
	}

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 2));
	radeon_ring_write(rdev, (PA_SC_SCREEN_SCISSOR_TL - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, (x1 << 0) | (y1 << 16));
	radeon_ring_write(rdev, (x2 << 0) | (y2 << 16));

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 2));
	radeon_ring_write(rdev, (PA_SC_GENERIC_SCISSOR_TL - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, (x1 << 0) | (y1 << 16) | (1 << 31));
	radeon_ring_write(rdev, (x2 << 0) | (y2 << 16));

	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONTEXT_REG, 2));
	radeon_ring_write(rdev, (PA_SC_WINDOW_SCISSOR_TL - PACKET3_SET_CONTEXT_REG_START) >> 2);
	radeon_ring_write(rdev, (x1 << 0) | (y1 << 16) | (1 << 31));
	radeon_ring_write(rdev, (x2 << 0) | (y2 << 16));
}

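/* Emits an auto-indexed RECTLIST draw: primitive type, index type,
 * one instance, three vertices.
 */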
/* emits 10 */
static void
draw_auto(struct radeon_device *rdev)
{
	radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONFIG_REG, 1));
	radeon_ring_write(rdev, (VGT_PRIMITIVE_TYPE - PACKET3_SET_CONFIG_REG_START) >> 2);
	radeon_ring_write(rdev, DI_PT_RECTLIST);

	radeon_ring_write(rdev, PACKET3(PACKET3_INDEX_TYPE, 0));
	radeon_ring_write(rdev,
#ifdef __BIG_ENDIAN
			  (2 << 2) |
#endif
			  DI_INDEX_SIZE_16_BIT);

	radeon_ring_write(rdev, PACKET3(PACKET3_NUM_INSTANCES, 0));
	radeon_ring_write(rdev, 1);

	radeon_ring_write(rdev, PACKET3(PACKET3_DRAW_INDEX_AUTO, 1));
	radeon_ring_write(rdev, 3);
	radeon_ring_write(rdev, DI_SRC_SEL_AUTO_INDEX);

}

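/* Emits the baseline state for the blit: CLEAR_STATE, per-family SQ
 * GPR/thread/stack setup on pre-Cayman parts, context control, a default
 * sampler, DX10/11 mode and an indirect buffer pointing at the pre-built
 * default state block.
 */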
/* emits 39 */
static void
set_default_state(struct radeon_device *rdev)
{
	u32 sq_config, sq_gpr_resource_mgmt_1, sq_gpr_resource_mgmt_2, sq_gpr_resource_mgmt_3;
	u32 sq_thread_resource_mgmt, sq_thread_resource_mgmt_2;
	u32 sq_stack_resource_mgmt_1, sq_stack_resource_mgmt_2, sq_stack_resource_mgmt_3;
	int num_ps_gprs, num_vs_gprs, num_temp_gprs;
	int num_gs_gprs, num_es_gprs, num_hs_gprs, num_ls_gprs;
	int num_ps_threads, num_vs_threads, num_gs_threads, num_es_threads;
	int num_hs_threads, num_ls_threads;
	int num_ps_stack_entries, num_vs_stack_entries, num_gs_stack_entries, num_es_stack_entries;
	int num_hs_stack_entries, num_ls_stack_entries;
	u64 gpu_addr;
	int dwords;

	/* set clear context state */
	radeon_ring_write(rdev, PACKET3(PACKET3_CLEAR_STATE, 0));
	radeon_ring_write(rdev, 0);

	if (rdev->family < CHIP_CAYMAN) {
		switch (rdev->family) {
		case CHIP_CEDAR:
		default:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 96;
			num_vs_threads = 16;
			num_gs_threads = 16;
			num_es_threads = 16;
			num_hs_threads = 16;
			num_ls_threads = 16;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		case CHIP_REDWOOD:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 20;
			num_gs_threads = 20;
			num_es_threads = 20;
			num_hs_threads = 20;
			num_ls_threads = 20;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		case CHIP_JUNIPER:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 20;
			num_gs_threads = 20;
			num_es_threads = 20;
			num_hs_threads = 20;
			num_ls_threads = 20;
			num_ps_stack_entries = 85;
			num_vs_stack_entries = 85;
			num_gs_stack_entries = 85;
			num_es_stack_entries = 85;
			num_hs_stack_entries = 85;
			num_ls_stack_entries = 85;
			break;
		case CHIP_CYPRESS:
		case CHIP_HEMLOCK:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 20;
			num_gs_threads = 20;
			num_es_threads = 20;
			num_hs_threads = 20;
			num_ls_threads = 20;
			num_ps_stack_entries = 85;
			num_vs_stack_entries = 85;
			num_gs_stack_entries = 85;
			num_es_stack_entries = 85;
			num_hs_stack_entries = 85;
			num_ls_stack_entries = 85;
			break;
		case CHIP_PALM:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 96;
			num_vs_threads = 16;
			num_gs_threads = 16;
			num_es_threads = 16;
			num_hs_threads = 16;
			num_ls_threads = 16;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		case CHIP_SUMO:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 96;
			num_vs_threads = 25;
			num_gs_threads = 25;
			num_es_threads = 25;
			num_hs_threads = 25;
			num_ls_threads = 25;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		case CHIP_SUMO2:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 96;
			num_vs_threads = 25;
			num_gs_threads = 25;
			num_es_threads = 25;
			num_hs_threads = 25;
			num_ls_threads = 25;
			num_ps_stack_entries = 85;
			num_vs_stack_entries = 85;
			num_gs_stack_entries = 85;
			num_es_stack_entries = 85;
			num_hs_stack_entries = 85;
			num_ls_stack_entries = 85;
			break;
		case CHIP_BARTS:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 20;
			num_gs_threads = 20;
			num_es_threads = 20;
			num_hs_threads = 20;
			num_ls_threads = 20;
			num_ps_stack_entries = 85;
			num_vs_stack_entries = 85;
			num_gs_stack_entries = 85;
			num_es_stack_entries = 85;
			num_hs_stack_entries = 85;
			num_ls_stack_entries = 85;
			break;
		case CHIP_TURKS:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 20;
			num_gs_threads = 20;
			num_es_threads = 20;
			num_hs_threads = 20;
			num_ls_threads = 20;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		case CHIP_CAICOS:
			num_ps_gprs = 93;
			num_vs_gprs = 46;
			num_temp_gprs = 4;
			num_gs_gprs = 31;
			num_es_gprs = 31;
			num_hs_gprs = 23;
			num_ls_gprs = 23;
			num_ps_threads = 128;
			num_vs_threads = 10;
			num_gs_threads = 10;
			num_es_threads = 10;
			num_hs_threads = 10;
			num_ls_threads = 10;
			num_ps_stack_entries = 42;
			num_vs_stack_entries = 42;
			num_gs_stack_entries = 42;
			num_es_stack_entries = 42;
			num_hs_stack_entries = 42;
			num_ls_stack_entries = 42;
			break;
		}

		if ((rdev->family == CHIP_CEDAR) ||
		    (rdev->family == CHIP_PALM) ||
		    (rdev->family == CHIP_SUMO) ||
		    (rdev->family == CHIP_SUMO2) ||
		    (rdev->family == CHIP_CAICOS))
			sq_config = 0;
		else
			sq_config = VC_ENABLE;

		sq_config |= (EXPORT_SRC_C |
			      CS_PRIO(0) |
			      LS_PRIO(0) |
			      HS_PRIO(0) |
			      PS_PRIO(0) |
			      VS_PRIO(1) |
			      GS_PRIO(2) |
			      ES_PRIO(3));

		sq_gpr_resource_mgmt_1 = (NUM_PS_GPRS(num_ps_gprs) |
					  NUM_VS_GPRS(num_vs_gprs) |
					  NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
		sq_gpr_resource_mgmt_2 = (NUM_GS_GPRS(num_gs_gprs) |
					  NUM_ES_GPRS(num_es_gprs));
		sq_gpr_resource_mgmt_3 = (NUM_HS_GPRS(num_hs_gprs) |
					  NUM_LS_GPRS(num_ls_gprs));
		sq_thread_resource_mgmt = (NUM_PS_THREADS(num_ps_threads) |
					   NUM_VS_THREADS(num_vs_threads) |
					   NUM_GS_THREADS(num_gs_threads) |
					   NUM_ES_THREADS(num_es_threads));
		sq_thread_resource_mgmt_2 = (NUM_HS_THREADS(num_hs_threads) |
					     NUM_LS_THREADS(num_ls_threads));
		sq_stack_resource_mgmt_1 = (NUM_PS_STACK_ENTRIES(num_ps_stack_entries) |
					    NUM_VS_STACK_ENTRIES(num_vs_stack_entries));
		sq_stack_resource_mgmt_2 = (NUM_GS_STACK_ENTRIES(num_gs_stack_entries) |
					    NUM_ES_STACK_ENTRIES(num_es_stack_entries));
		sq_stack_resource_mgmt_3 = (NUM_HS_STACK_ENTRIES(num_hs_stack_entries) |
					    NUM_LS_STACK_ENTRIES(num_ls_stack_entries));

		/* disable dyn gprs */
		radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONFIG_REG, 1));
		radeon_ring_write(rdev, (SQ_DYN_GPR_CNTL_PS_FLUSH_REQ - PACKET3_SET_CONFIG_REG_START) >> 2);
		radeon_ring_write(rdev, 0);

		/* setup LDS */
		radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONFIG_REG, 1));
		radeon_ring_write(rdev, (SQ_LDS_RESOURCE_MGMT - PACKET3_SET_CONFIG_REG_START) >> 2);
		radeon_ring_write(rdev, 0x10001000);

		/* SQ config */
		radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONFIG_REG, 11));
		radeon_ring_write(rdev, (SQ_CONFIG - PACKET3_SET_CONFIG_REG_START) >> 2);
		radeon_ring_write(rdev, sq_config);
		radeon_ring_write(rdev, sq_gpr_resource_mgmt_1);
		radeon_ring_write(rdev, sq_gpr_resource_mgmt_2);
		radeon_ring_write(rdev, sq_gpr_resource_mgmt_3);
		radeon_ring_write(rdev, 0);
		radeon_ring_write(rdev, 0);
		radeon_ring_write(rdev, sq_thread_resource_mgmt);
		radeon_ring_write(rdev, sq_thread_resource_mgmt_2);
		radeon_ring_write(rdev, sq_stack_resource_mgmt_1);
		radeon_ring_write(rdev, sq_stack_resource_mgmt_2);
		radeon_ring_write(rdev, sq_stack_resource_mgmt_3);
	}

	/* CONTEXT_CONTROL */
	radeon_ring_write(rdev, 0xc0012800);
	radeon_ring_write(rdev, 0x80000000);
	radeon_ring_write(rdev, 0x80000000);

	/* SQ_VTX_BASE_VTX_LOC */
	radeon_ring_write(rdev, 0xc0026f00);
	radeon_ring_write(rdev, 0x00000000);
	radeon_ring_write(rdev, 0x00000000);
	radeon_ring_write(rdev, 0x00000000);

	/* SET_SAMPLER */
	radeon_ring_write(rdev, 0xc0036e00);
	radeon_ring_write(rdev, 0x00000000);
	radeon_ring_write(rdev, 0x00000012);
	radeon_ring_write(rdev, 0x00000000);
	radeon_ring_write(rdev, 0x00000000);

	/* set to DX10/11 mode */
	radeon_ring_write(rdev, PACKET3(PACKET3_MODE_CONTROL, 0));
	radeon_ring_write(rdev, 1);

	/* emit an IB pointing at default state */
	dwords = ALIGN(rdev->r600_blit.state_len, 0x10);
	gpu_addr = rdev->r600_blit.shader_gpu_addr + rdev->r600_blit.state_offset;
	radeon_ring_write(rdev, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
	radeon_ring_write(rdev, gpu_addr & 0xFFFFFFFC);
	radeon_ring_write(rdev, upper_32_bits(gpu_addr) & 0xFF);
	radeon_ring_write(rdev, dwords);

}

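/* Converts a small unsigned integer (only the low 14 bits are used) into
 * its IEEE 754 single-precision bit pattern; the blit vertex data written
 * below is consumed as floats.  For example, i2f(5) yields 0x40A00000,
 * the encoding of 5.0f.
 */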
static inline uint32_t i2f(uint32_t input)
{
	u32 result, i, exponent, fraction;

	if ((input & 0x3fff) == 0)
		result = 0; /* 0 is a special case */
	else {
		exponent = 140; /* exponent biased by 127; */
		fraction = (input & 0x3fff) << 10; /* cheat and only
						      handle numbers below 2^15 */
		for (i = 0; i < 14; i++) {
			if (fraction & 0x800000)
				break;
			else {
				fraction = fraction << 1; /* keep
							     shifting left until top bit = 1 */
				exponent = exponent - 1;
			}
		}
		result = exponent << 23 | (fraction & 0x7fffff); /* mask
								    off top bit; assumed 1 */
	}
	return result;
}

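/* Allocates (or reuses) a VRAM buffer object holding the default state
 * block and the blit vertex/pixel shaders, copies the blobs in and pins
 * the object, recording offsets and the GPU address in rdev->r600_blit.
 */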
int evergreen_blit_init(struct radeon_device *rdev)
{
	u32 obj_size;
	int i, r, dwords;
	void *ptr;
	u32 packet2s[16];
	int num_packet2s = 0;

	/* pin copy shader into vram if already initialized */
	if (rdev->r600_blit.shader_obj)
		goto done;

	mutex_init(&rdev->r600_blit.mutex);
	rdev->r600_blit.state_offset = 0;

	if (rdev->family < CHIP_CAYMAN)
		rdev->r600_blit.state_len = evergreen_default_size;
	else
		rdev->r600_blit.state_len = cayman_default_size;

	dwords = rdev->r600_blit.state_len;
	while (dwords & 0xf) {
		packet2s[num_packet2s++] = cpu_to_le32(PACKET2(0));
		dwords++;
	}

	obj_size = dwords * 4;
	obj_size = ALIGN(obj_size, 256);

	rdev->r600_blit.vs_offset = obj_size;
	if (rdev->family < CHIP_CAYMAN)
		obj_size += evergreen_vs_size * 4;
	else
		obj_size += cayman_vs_size * 4;
	obj_size = ALIGN(obj_size, 256);

	rdev->r600_blit.ps_offset = obj_size;
	if (rdev->family < CHIP_CAYMAN)
		obj_size += evergreen_ps_size * 4;
	else
		obj_size += cayman_ps_size * 4;
	obj_size = ALIGN(obj_size, 256);

	r = radeon_bo_create(rdev, obj_size, PAGE_SIZE, true, RADEON_GEM_DOMAIN_VRAM,
				&rdev->r600_blit.shader_obj);
	if (r) {
		DRM_ERROR("evergreen failed to allocate shader\n");
		return r;
	}

	DRM_DEBUG("evergreen blit allocated bo %08x vs %08x ps %08x\n",
		  obj_size,
		  rdev->r600_blit.vs_offset, rdev->r600_blit.ps_offset);

	r = radeon_bo_reserve(rdev->r600_blit.shader_obj, false);
	if (unlikely(r != 0))
		return r;
	r = radeon_bo_kmap(rdev->r600_blit.shader_obj, &ptr);
	if (r) {
		DRM_ERROR("failed to map blit object %d\n", r);
		return r;
	}

	if (rdev->family < CHIP_CAYMAN) {
		memcpy(ptr + rdev->r600_blit.state_offset,
		       evergreen_default_state, rdev->r600_blit.state_len * 4);

		if (num_packet2s)
			memcpy(ptr + rdev->r600_blit.state_offset + (rdev->r600_blit.state_len * 4),
			       packet2s, num_packet2s * 4);
		for (i = 0; i < evergreen_vs_size; i++)
			*(u32 *)((unsigned long)ptr + rdev->r600_blit.vs_offset + i * 4) = cpu_to_le32(evergreen_vs[i]);
		for (i = 0; i < evergreen_ps_size; i++)
			*(u32 *)((unsigned long)ptr + rdev->r600_blit.ps_offset + i * 4) = cpu_to_le32(evergreen_ps[i]);
	} else {
		memcpy(ptr + rdev->r600_blit.state_offset,
		       cayman_default_state, rdev->r600_blit.state_len * 4);

		if (num_packet2s)
			memcpy(ptr + rdev->r600_blit.state_offset + (rdev->r600_blit.state_len * 4),
			       packet2s, num_packet2s * 4);
		for (i = 0; i < cayman_vs_size; i++)
			*(u32 *)((unsigned long)ptr + rdev->r600_blit.vs_offset + i * 4) = cpu_to_le32(cayman_vs[i]);
		for (i = 0; i < cayman_ps_size; i++)
			*(u32 *)((unsigned long)ptr + rdev->r600_blit.ps_offset + i * 4) = cpu_to_le32(cayman_ps[i]);
	}
	radeon_bo_kunmap(rdev->r600_blit.shader_obj);
	radeon_bo_unreserve(rdev->r600_blit.shader_obj);

done:
	r = radeon_bo_reserve(rdev->r600_blit.shader_obj, false);
	if (unlikely(r != 0))
		return r;
	r = radeon_bo_pin(rdev->r600_blit.shader_obj, RADEON_GEM_DOMAIN_VRAM,
			  &rdev->r600_blit.shader_gpu_addr);
	radeon_bo_unreserve(rdev->r600_blit.shader_obj);
	if (r) {
		dev_err(rdev->dev, "(%d) pin blit object failed\n", r);
		return r;
	}
//   radeon_ttm_set_active_vram_size(rdev, rdev->mc.real_vram_size);
	return 0;
}

void evergreen_blit_fini(struct radeon_device *rdev)
{
	int r;

//   radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size);
	if (rdev->r600_blit.shader_obj == NULL)
		return;
	/* If we can't reserve the bo, unref should be enough to destroy
	 * it when it becomes idle.
	 */
	r = radeon_bo_reserve(rdev->r600_blit.shader_obj, false);
	if (!r) {
		radeon_bo_unpin(rdev->r600_blit.shader_obj);
		radeon_bo_unreserve(rdev->r600_blit.shader_obj);
	}
	radeon_bo_unref(&rdev->r600_blit.shader_obj);
}

static int evergreen_vb_ib_get(struct radeon_device *rdev)
{
	int r;
	r = radeon_ib_get(rdev, &rdev->r600_blit.vb_ib);
	if (r) {
		DRM_ERROR("failed to get IB for vertex buffer\n");
		return r;
	}

	rdev->r600_blit.vb_total = 64*1024;
	rdev->r600_blit.vb_used = 0;
	return 0;
}

static void evergreen_vb_ib_put(struct radeon_device *rdev)
{
	radeon_fence_emit(rdev, rdev->r600_blit.vb_ib->fence);
	radeon_ib_free(rdev, &rdev->r600_blit.vb_ib);
}

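/* Gets an indirect buffer for the blit vertex data and locks enough ring
 * space for the worst-case number of copy loops plus state, shader and
 * fence overhead, then emits the default state and shader setup.
 */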
int evergreen_blit_prepare_copy(struct radeon_device *rdev, int size_bytes)
{
	int r;
	int ring_size, line_size;
	int max_size;
	/* loops of emits + fence emit possible */
	int dwords_per_loop = 74, num_loops;

	r = evergreen_vb_ib_get(rdev);
	if (r)
		return r;

	/* 8 bpp vs 32 bpp for xfer unit */
	if (size_bytes & 3)
		line_size = 8192;
	else
		line_size = 8192 * 4;

	max_size = 8192 * line_size;

	/* major loops cover the max size transfer */
	num_loops = ((size_bytes + max_size) / max_size);
	/* minor loops cover the extra non aligned bits */
	num_loops += ((size_bytes % line_size) ? 1 : 0);
	/* calculate number of loops correctly */
	ring_size = num_loops * dwords_per_loop;
	/* set default  + shaders */
	ring_size += 55; /* shaders + def state */
	ring_size += 10; /* fence emit for VB IB */
	ring_size += 5; /* done copy */
	ring_size += 10; /* fence emit for done copy */
	r = radeon_ring_lock(rdev, ring_size);
	if (r)
		return r;

	set_default_state(rdev); /* 36 */
	set_shaders(rdev); /* 16 */
	return 0;
}

void evergreen_blit_done_copy(struct radeon_device *rdev, struct radeon_fence *fence)
{
	int r;

	if (rdev->r600_blit.vb_ib)
		evergreen_vb_ib_put(rdev);

	if (fence)
		r = radeon_fence_emit(rdev, fence);

	radeon_ring_unlock_commit(rdev);
}

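/* Carries out the copy as a series of textured rectangle draws.  Transfers
 * whose size or addresses are not 4-byte aligned use the 8bpp path (lines
 * of up to 8192 bytes); aligned transfers use the 32bpp path (lines of up
 * to 8192 * 4 bytes).  Each loop iteration emits one rectangle and advances
 * the source and destination addresses.
 */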
void evergreen_kms_blit_copy(struct radeon_device *rdev,
			     u64 src_gpu_addr, u64 dst_gpu_addr,
			     int size_bytes)
{
	int max_bytes;
	u64 vb_gpu_addr;
	u32 *vb;

	DRM_DEBUG("emitting copy %16llx %16llx %d %d\n", src_gpu_addr, dst_gpu_addr,
		  size_bytes, rdev->r600_blit.vb_used);
	vb = (u32 *)(rdev->r600_blit.vb_ib->ptr + rdev->r600_blit.vb_used);
	if ((size_bytes & 3) || (src_gpu_addr & 3) || (dst_gpu_addr & 3)) {
		max_bytes = 8192;

		while (size_bytes) {
			int cur_size = size_bytes;
			int src_x = src_gpu_addr & 255;
			int dst_x = dst_gpu_addr & 255;
			int h = 1;
			src_gpu_addr = src_gpu_addr & ~255ULL;
			dst_gpu_addr = dst_gpu_addr & ~255ULL;

			if (!src_x && !dst_x) {
				h = (cur_size / max_bytes);
				if (h > 8192)
					h = 8192;
				if (h == 0)
					h = 1;
				else
					cur_size = max_bytes;
			} else {
				if (cur_size > max_bytes)
					cur_size = max_bytes;
				if (cur_size > (max_bytes - dst_x))
					cur_size = (max_bytes - dst_x);
				if (cur_size > (max_bytes - src_x))
					cur_size = (max_bytes - src_x);
			}

			if ((rdev->r600_blit.vb_used + 48) > rdev->r600_blit.vb_total) {
				//   WARN_ON(1);
			}

			vb[0] = i2f(dst_x);
			vb[1] = 0;
			vb[2] = i2f(src_x);
			vb[3] = 0;

			vb[4] = i2f(dst_x);
			vb[5] = i2f(h);
			vb[6] = i2f(src_x);
			vb[7] = i2f(h);

			vb[8] = i2f(dst_x + cur_size);
			vb[9] = i2f(h);
			vb[10] = i2f(src_x + cur_size);
			vb[11] = i2f(h);

			/* src 10 */
			set_tex_resource(rdev, FMT_8,
					 src_x + cur_size, h, src_x + cur_size,
					 src_gpu_addr);

			/* 5 */
			cp_set_surface_sync(rdev,
					    PACKET3_TC_ACTION_ENA, (src_x + cur_size * h), src_gpu_addr);


			/* dst 17 */
			set_render_target(rdev, COLOR_8,
					  dst_x + cur_size, h,
					  dst_gpu_addr);

			/* scissors 12 */
			set_scissors(rdev, dst_x, 0, dst_x + cur_size, h);

			/* 15 */
			vb_gpu_addr = rdev->r600_blit.vb_ib->gpu_addr + rdev->r600_blit.vb_used;
			set_vtx_resource(rdev, vb_gpu_addr);

			/* draw 10 */
			draw_auto(rdev);

			/* 5 */
			cp_set_surface_sync(rdev,
					    PACKET3_CB_ACTION_ENA | PACKET3_CB0_DEST_BASE_ENA,
					    cur_size * h, dst_gpu_addr);

			vb += 12;
			rdev->r600_blit.vb_used += 12 * 4;

			src_gpu_addr += cur_size * h;
			dst_gpu_addr += cur_size * h;
			size_bytes -= cur_size * h;
		}
	} else {
		max_bytes = 8192 * 4;

		while (size_bytes) {
			int cur_size = size_bytes;
			int src_x = (src_gpu_addr & 255);
			int dst_x = (dst_gpu_addr & 255);
			int h = 1;
			src_gpu_addr = src_gpu_addr & ~255ULL;
			dst_gpu_addr = dst_gpu_addr & ~255ULL;

			if (!src_x && !dst_x) {
				h = (cur_size / max_bytes);
				if (h > 8192)
					h = 8192;
				if (h == 0)
					h = 1;
				else
					cur_size = max_bytes;
			} else {
				if (cur_size > max_bytes)
					cur_size = max_bytes;
				if (cur_size > (max_bytes - dst_x))
					cur_size = (max_bytes - dst_x);
				if (cur_size > (max_bytes - src_x))
					cur_size = (max_bytes - src_x);
			}

			if ((rdev->r600_blit.vb_used + 48) > rdev->r600_blit.vb_total) {
				//   WARN_ON(1);
			}

			vb[0] = i2f(dst_x / 4);
			vb[1] = 0;
			vb[2] = i2f(src_x / 4);
			vb[3] = 0;

			vb[4] = i2f(dst_x / 4);
			vb[5] = i2f(h);
			vb[6] = i2f(src_x / 4);
			vb[7] = i2f(h);

			vb[8] = i2f((dst_x + cur_size) / 4);
			vb[9] = i2f(h);
			vb[10] = i2f((src_x + cur_size) / 4);
			vb[11] = i2f(h);

			/* src 10 */
			set_tex_resource(rdev, FMT_8_8_8_8,
					 (src_x + cur_size) / 4,
					 h, (src_x + cur_size) / 4,
					 src_gpu_addr);
			/* 5 */
			cp_set_surface_sync(rdev,
					    PACKET3_TC_ACTION_ENA, (src_x + cur_size * h), src_gpu_addr);

			/* dst 17 */
			set_render_target(rdev, COLOR_8_8_8_8,
					  (dst_x + cur_size) / 4, h,
					  dst_gpu_addr);

			/* scissors 12  */
			set_scissors(rdev, (dst_x / 4), 0, (dst_x + cur_size / 4), h);

			/* Vertex buffer setup 15 */
			vb_gpu_addr = rdev->r600_blit.vb_ib->gpu_addr + rdev->r600_blit.vb_used;
			set_vtx_resource(rdev, vb_gpu_addr);

			/* draw 10 */
			draw_auto(rdev);

			/* 5 */
			cp_set_surface_sync(rdev,
					    PACKET3_CB_ACTION_ENA | PACKET3_CB0_DEST_BASE_ENA,
					    cur_size * h, dst_gpu_addr);

			/* 74 ring dwords per loop */
			vb += 12;
			rdev->r600_blit.vb_used += 12 * 4;

			src_gpu_addr += cur_size * h;
			dst_gpu_addr += cur_size * h;
			size_bytes -= cur_size * h;
		}
	}
}