/contrib/sdk/sources/Intel-2D/Makefile |
---|
13,6 → 13,7 |
LDFLAGS:= -shared -s -nostdlib -T ../newlib/dll.lds --entry _DllStartup --image-base=0 |
PXFLAGS:= --version-script pixlib.ver --output-def $(LIBRARY).orig.def --out-implib $(LIBRARY).dll.a |
SNAFLAGS:= --version-script sna.ver --output-def sna.def |
UXAFLAGS:= --version-script uxa.ver --output-def uxa.def |
INCLUDES= -I. -I../libdrm/intel -I../libdrm/include/drm -I./render_program -I../pixman -I../newlib/include |
25,7 → 26,10 |
SRC_PIXLIB = pixlib.c |
SRC_SNA = sna/gen3_render.c \ |
SRC_SNA = \ |
sna/gen4_common.c \ |
sna/gen6_common.c \ |
sna/gen3_render.c \ |
sna/gen4_render.c \ |
sna/gen4_vertex.c \ |
sna/gen5_render.c \ |
70,7 → 74,7 |
# targets |
all:$(LIBRARY).dll intel-sna.drv |
uxa:$(LIBRARY).dll |
uxa:$(LIBRARY).dll intel-uxa.drv |
ebox:$(LIBRARY).dll |
86,7 → 90,7 |
mv -f $@ ../../bin |
intel-uxa.drv: $(OBJ_UXA) Makefile |
$(LD) $(LDFLAGS) $(LIBPATH) -o $@ $(OBJ_UXA) $(LIBS) |
$(LD) $(LDFLAGS) $(UXAFLAGS) $(LIBPATH) -o $@ $(OBJ_UXA) $(LIBS) |
$(STRIP) $@ |
mv -f $@ ../../bin |
/contrib/sdk/sources/Intel-2D/intel_driver.h |
---|
118,5 → 118,6 |
const struct intel_device_info *intel_detect_chipset(struct pci_device *pci); |
#define hosted() (0) |
#endif /* INTEL_DRIVER_H */ |
/contrib/sdk/sources/Intel-2D/intel_list.h |
---|
261,7 → 261,7 |
* @return True if the list contains one or more elements or False otherwise. |
*/ |
static inline bool |
list_is_empty(struct list *head) |
list_is_empty(const struct list *head) |
{ |
return head->next == head; |
} |
/contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c |
---|
521,7 → 521,7 |
if (p->gen >= 060) { |
/* First compute 1/z */ |
brw_PLN(p, |
brw_message_reg(msg), |
brw_vec8_grf(30, 0), |
brw_vec1_grf(uv+1, 0), |
brw_vec8_grf(2, 0)); |
532,22 → 532,22 |
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
} else |
brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); |
brw_PLN(p, |
brw_vec8_grf(28, 0), |
brw_vec8_grf(26, 0), |
brw_vec1_grf(uv, 0), |
brw_vec8_grf(2, 0)); |
brw_MUL(p, |
brw_message_reg(msg), |
brw_vec8_grf(28, 0), |
brw_vec8_grf(30, 0)); |
msg += dw/8; |
brw_PLN(p, |
brw_vec8_grf(28, 0), |
brw_vec1_grf(uv, 0), |
brw_vec8_grf(4, 0)); |
brw_MUL(p, |
brw_message_reg(msg), |
brw_vec8_grf(26, 0), |
brw_vec8_grf(30, 0)); |
brw_MUL(p, |
brw_message_reg(msg + dw/8), |
brw_vec8_grf(28, 0), |
brw_vec8_grf(30, 0)); |
} else { |
/contrib/sdk/sources/Intel-2D/sna/gen3_render.c |
---|
1459,7 → 1459,7 |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
free_bo = bo; |
} else if (IS_CPU_MAP(bo->map)) { |
} else if (sna->render.vertices == MAP(bo->map__cpu)) { |
DBG(("%s: converting CPU map to GTT\n", __FUNCTION__)); |
sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, bo); |
if (sna->render.vertices == NULL) { |
1657,7 → 1657,123 |
op->prim_emit(sna, op, r); |
} |
#if 0 |
fastcall static void |
gen3_render_composite_box(struct sna *sna, |
const struct sna_composite_op *op, |
const BoxRec *box) |
{ |
struct sna_composite_rectangles r; |
DBG(("%s: src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n", |
__FUNCTION__, |
op->src.offset[0], op->src.offset[1], |
op->mask.offset[0], op->mask.offset[1], |
op->dst.x, op->dst.y)); |
gen3_get_rectangles(sna, op, 1); |
r.dst.x = box->x1; |
r.dst.y = box->y1; |
r.width = box->x2 - box->x1; |
r.height = box->y2 - box->y1; |
r.src = r.mask = r.dst; |
op->prim_emit(sna, op, &r); |
} |
static void |
gen3_render_composite_boxes__blt(struct sna *sna, |
const struct sna_composite_op *op, |
const BoxRec *box, int nbox) |
{ |
DBG(("%s: nbox=%d, src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n", |
__FUNCTION__, nbox, |
op->src.offset[0], op->src.offset[1], |
op->mask.offset[0], op->mask.offset[1], |
op->dst.x, op->dst.y)); |
do { |
int nbox_this_time; |
nbox_this_time = gen3_get_rectangles(sna, op, nbox); |
nbox -= nbox_this_time; |
do { |
struct sna_composite_rectangles r; |
DBG((" %s: (%d, %d) x (%d, %d)\n", __FUNCTION__, |
box->x1, box->y1, |
box->x2 - box->x1, |
box->y2 - box->y1)); |
r.dst.x = box->x1; r.dst.y = box->y1; |
r.width = box->x2 - box->x1; |
r.height = box->y2 - box->y1; |
r.src = r.mask = r.dst; |
op->prim_emit(sna, op, &r); |
box++; |
} while (--nbox_this_time); |
} while (nbox); |
} |
static void |
gen3_render_composite_boxes(struct sna *sna, |
const struct sna_composite_op *op, |
const BoxRec *box, int nbox) |
{ |
DBG(("%s: nbox=%d\n", __FUNCTION__, nbox)); |
do { |
int nbox_this_time; |
float *v; |
nbox_this_time = gen3_get_rectangles(sna, op, nbox); |
assert(nbox_this_time); |
nbox -= nbox_this_time; |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += nbox_this_time * op->floats_per_rect; |
op->emit_boxes(op, box, nbox_this_time, v); |
box += nbox_this_time; |
} while (nbox); |
} |
static void |
gen3_render_composite_boxes__thread(struct sna *sna, |
const struct sna_composite_op *op, |
const BoxRec *box, int nbox) |
{ |
DBG(("%s: nbox=%d\n", __FUNCTION__, nbox)); |
sna_vertex_lock(&sna->render); |
do { |
int nbox_this_time; |
float *v; |
nbox_this_time = gen3_get_rectangles(sna, op, nbox); |
assert(nbox_this_time); |
nbox -= nbox_this_time; |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += nbox_this_time * op->floats_per_rect; |
sna_vertex_acquire__locked(&sna->render); |
sna_vertex_unlock(&sna->render); |
op->emit_boxes(op, box, nbox_this_time, v); |
box += nbox_this_time; |
sna_vertex_lock(&sna->render); |
sna_vertex_release__locked(&sna->render); |
} while (nbox); |
sna_vertex_unlock(&sna->render); |
} |
#endif |
static void |
gen3_render_composite_done(struct sna *sna, |
const struct sna_composite_op *op) |
{ |
1702,8 → 1818,7 |
state->last_floats_per_vertex = 0; |
state->last_vertex_offset = 0; |
if (sna->render.vbo != NULL && |
!kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { |
if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { |
DBG(("%s: discarding vbo as next access will stall: %d\n", |
__FUNCTION__, sna->render.vbo->presumed_offset)); |
discard_vbo(sna); |
1755,7 → 1870,286 |
} |
#if 0 |
static bool source_is_covered(PicturePtr picture, |
int x, int y, |
int width, int height) |
{ |
int x1, y1, x2, y2; |
if (picture->repeat && picture->repeatType != RepeatNone) |
return true; |
if (picture->pDrawable == NULL) |
return false; |
if (picture->transform) { |
pixman_box16_t sample; |
sample.x1 = x; |
sample.y1 = y; |
sample.x2 = x + width; |
sample.y2 = y + height; |
pixman_transform_bounds(picture->transform, &sample); |
x1 = sample.x1; |
x2 = sample.x2; |
y1 = sample.y1; |
y2 = sample.y2; |
} else { |
x1 = x; |
y1 = y; |
x2 = x + width; |
y2 = y + height; |
} |
return |
x1 >= 0 && y1 >= 0 && |
x2 <= picture->pDrawable->width && |
y2 <= picture->pDrawable->height; |
} |
static bool gen3_composite_channel_set_xformat(PicturePtr picture, |
struct sna_composite_channel *channel, |
int x, int y, |
int width, int height) |
{ |
unsigned int i; |
if (PICT_FORMAT_A(picture->format) != 0) |
return false; |
if (width == 0 || height == 0) |
return false; |
if (!source_is_covered(picture, x, y, width, height)) |
return false; |
for (i = 0; i < ARRAY_SIZE(gen3_tex_formats); i++) { |
if (gen3_tex_formats[i].xfmt == picture->format) { |
channel->card_format = gen3_tex_formats[i].card_fmt; |
channel->rb_reversed = gen3_tex_formats[i].rb_reversed; |
channel->alpha_fixup = true; |
return true; |
} |
} |
return false; |
} |
static int |
gen3_init_solid(struct sna_composite_channel *channel, uint32_t color) |
{ |
channel->u.gen3.mode = color; |
channel->u.gen3.type = SHADER_CONSTANT; |
if (color == 0) |
channel->u.gen3.type = SHADER_ZERO; |
else if (color == 0xff000000) |
channel->u.gen3.type = SHADER_BLACK; |
else if (color == 0xffffffff) |
channel->u.gen3.type = SHADER_WHITE; |
channel->bo = NULL; |
channel->is_opaque = (color >> 24) == 0xff; |
channel->is_affine = 1; |
channel->alpha_fixup = 0; |
channel->rb_reversed = 0; |
DBG(("%s: color=%08x, is_opaque=%d, type=%d\n", |
__FUNCTION__, color, channel->is_opaque, channel->u.gen3.type)); |
/* for consistency */ |
channel->repeat = RepeatNormal; |
channel->filter = PictFilterNearest; |
channel->pict_format = PICT_a8r8g8b8; |
channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888; |
return 1; |
} |
static void gen3_composite_channel_convert(struct sna_composite_channel *channel) |
{ |
if (channel->u.gen3.type == SHADER_TEXTURE) |
channel->repeat = gen3_texture_repeat(channel->repeat); |
else |
channel->repeat = gen3_gradient_repeat(channel->repeat); |
channel->filter = gen3_filter(channel->filter); |
if (channel->card_format == 0) |
gen3_composite_channel_set_format(channel, channel->pict_format); |
assert(channel->card_format); |
} |
static bool gen3_gradient_setup(struct sna *sna, |
PicturePtr picture, |
struct sna_composite_channel *channel, |
int16_t ox, int16_t oy) |
{ |
int16_t dx, dy; |
if (picture->repeat == 0) { |
channel->repeat = RepeatNone; |
} else switch (picture->repeatType) { |
case RepeatNone: |
case RepeatNormal: |
case RepeatPad: |
case RepeatReflect: |
channel->repeat = picture->repeatType; |
break; |
default: |
return false; |
} |
channel->bo = |
sna_render_get_gradient(sna, |
(PictGradient *)picture->pSourcePict); |
if (channel->bo == NULL) |
return false; |
channel->pict_format = PICT_a8r8g8b8; |
channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888; |
channel->filter = PictFilterNearest; |
channel->is_affine = sna_transform_is_affine(picture->transform); |
if (sna_transform_is_integer_translation(picture->transform, &dx, &dy)) { |
DBG(("%s: integer translation (%d, %d), removing\n", |
__FUNCTION__, dx, dy)); |
ox += dx; |
oy += dy; |
channel->transform = NULL; |
} else |
channel->transform = picture->transform; |
channel->width = channel->bo->pitch / 4; |
channel->height = 1; |
channel->offset[0] = ox; |
channel->offset[1] = oy; |
channel->scale[0] = channel->scale[1] = 1; |
return true; |
} |
static int |
gen3_init_linear(struct sna *sna, |
PicturePtr picture, |
struct sna_composite_op *op, |
struct sna_composite_channel *channel, |
int ox, int oy) |
{ |
PictLinearGradient *linear = |
(PictLinearGradient *)picture->pSourcePict; |
float x0, y0, sf; |
float dx, dy, offset; |
int n; |
DBG(("%s: p1=(%f, %f), p2=(%f, %f)\n", |
__FUNCTION__, |
xFixedToDouble(linear->p1.x), xFixedToDouble(linear->p1.y), |
xFixedToDouble(linear->p2.x), xFixedToDouble(linear->p2.y))); |
if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y) |
return 0; |
dx = xFixedToDouble(linear->p2.x - linear->p1.x); |
dy = xFixedToDouble(linear->p2.y - linear->p1.y); |
sf = dx*dx + dy*dy; |
dx /= sf; |
dy /= sf; |
x0 = xFixedToDouble(linear->p1.x); |
y0 = xFixedToDouble(linear->p1.y); |
offset = dx*x0 + dy*y0; |
n = op->u.gen3.num_constants; |
channel->u.gen3.constants = FS_C0 + n / 4; |
op->u.gen3.constants[n++] = dx; |
op->u.gen3.constants[n++] = dy; |
op->u.gen3.constants[n++] = -offset; |
op->u.gen3.constants[n++] = 0; |
if (!gen3_gradient_setup(sna, picture, channel, ox, oy)) |
return -1; |
channel->u.gen3.type = SHADER_LINEAR; |
op->u.gen3.num_constants = n; |
DBG(("%s: dx=%f, dy=%f, offset=%f, constants=%d\n", |
__FUNCTION__, dx, dy, -offset, channel->u.gen3.constants - FS_C0)); |
return 1; |
} |
static int |
gen3_init_radial(struct sna *sna, |
PicturePtr picture, |
struct sna_composite_op *op, |
struct sna_composite_channel *channel, |
int ox, int oy) |
{ |
PictRadialGradient *radial = (PictRadialGradient *)picture->pSourcePict; |
double dx, dy, dr, r1; |
int n; |
dx = xFixedToDouble(radial->c2.x - radial->c1.x); |
dy = xFixedToDouble(radial->c2.y - radial->c1.y); |
dr = xFixedToDouble(radial->c2.radius - radial->c1.radius); |
r1 = xFixedToDouble(radial->c1.radius); |
n = op->u.gen3.num_constants; |
channel->u.gen3.constants = FS_C0 + n / 4; |
if (radial->c2.x == radial->c1.x && radial->c2.y == radial->c1.y) { |
if (radial->c2.radius == radial->c1.radius) { |
channel->u.gen3.type = SHADER_ZERO; |
return 1; |
} |
op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.x) / dr; |
op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.y) / dr; |
op->u.gen3.constants[n++] = 1. / dr; |
op->u.gen3.constants[n++] = -r1 / dr; |
channel->u.gen3.mode = RADIAL_ONE; |
} else { |
op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.x); |
op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.y); |
op->u.gen3.constants[n++] = r1; |
op->u.gen3.constants[n++] = -4 * (dx*dx + dy*dy - dr*dr); |
op->u.gen3.constants[n++] = -2 * dx; |
op->u.gen3.constants[n++] = -2 * dy; |
op->u.gen3.constants[n++] = -2 * r1 * dr; |
op->u.gen3.constants[n++] = 1 / (2 * (dx*dx + dy*dy - dr*dr)); |
channel->u.gen3.mode = RADIAL_TWO; |
} |
if (!gen3_gradient_setup(sna, picture, channel, ox, oy)) |
return -1; |
channel->u.gen3.type = SHADER_RADIAL; |
op->u.gen3.num_constants = n; |
return 1; |
} |
static bool |
sna_picture_is_clear(PicturePtr picture, |
int x, int y, int w, int h, |
uint32_t *color) |
{ |
struct sna_pixmap *priv; |
if (!picture->pDrawable) |
return false; |
priv = sna_pixmap(get_drawable_pixmap(picture->pDrawable)); |
if (priv == NULL || !priv->clear) |
return false; |
if (!source_is_covered(picture, x, y, w, h)) |
return false; |
*color = priv->clear_color; |
return true; |
} |
static int |
gen3_composite_picture(struct sna *sna, |
PicturePtr picture, |
struct sna_composite_op *op, |
1874,74 → 2268,37 |
return sna_render_pixmap_bo(sna, channel, pixmap, |
x, y, w, h, dst_x, dst_y); |
} |
#endif |
static inline bool |
source_use_blt(struct sna *sna, PicturePtr picture) |
static void |
gen3_align_vertex(struct sna *sna, |
const struct sna_composite_op *op) |
{ |
/* If it is a solid, try to use the BLT paths */ |
if (!picture->pDrawable) |
return picture->pSourcePict->type == SourcePictTypeSolidFill; |
int vertex_index; |
if (picture->pDrawable->width == 1 && |
picture->pDrawable->height == 1 && |
picture->repeat) |
return true; |
if (op->floats_per_vertex == sna->render_state.gen3.last_floats_per_vertex) |
return; |
if (too_large(picture->pDrawable->width, picture->pDrawable->height)) |
return true; |
DBG(("aligning vertex: was %d, now %d floats per vertex\n", |
sna->render_state.gen3.last_floats_per_vertex, |
op->floats_per_vertex)); |
return !is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER); |
} |
assert(op->floats_per_rect == 3*op->floats_per_vertex); |
static bool |
try_blt(struct sna *sna, |
PicturePtr dst, |
PicturePtr src, |
int width, int height) |
{ |
if (sna->kgem.mode != KGEM_RENDER) { |
DBG(("%s: already performing BLT\n", __FUNCTION__)); |
return true; |
} |
vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) { |
DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n", |
__FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex)); |
if (gen3_vertex_finish(sna) < op->floats_per_vertex) |
kgem_submit(&sna->kgem); |
if (too_large(width, height)) { |
DBG(("%s: operation too large for 3D pipe (%d, %d)\n", |
__FUNCTION__, width, height)); |
return true; |
vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
} |
if (too_large(dst->pDrawable->width, dst->pDrawable->height)) { |
DBG(("%s: target too large for 3D pipe (%d, %d)\n", |
__FUNCTION__, |
dst->pDrawable->width, dst->pDrawable->height)); |
return true; |
} |
/* is the source picture only in cpu memory e.g. a shm pixmap? */ |
return source_use_blt(sna, src); |
sna->render.vertex_index = vertex_index; |
sna->render.vertex_used = vertex_index * op->floats_per_vertex; |
} |
#endif |
static void |
gen3_align_vertex(struct sna *sna, |
const struct sna_composite_op *op) |
{ |
if (op->floats_per_vertex != sna->render_state.gen3.last_floats_per_vertex) { |
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) |
gen3_vertex_finish(sna); |
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", |
sna->render_state.gen3.last_floats_per_vertex, |
op->floats_per_vertex, |
sna->render.vertex_index, |
(sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); |
sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; |
assert(sna->render.vertex_used < sna->render.vertex_size - op->floats_per_rect); |
sna->render_state.gen3.last_floats_per_vertex = op->floats_per_vertex; |
} |
} |
static inline bool is_constant_ps(uint32_t type) |
{ |
switch (type) { |
2059,6 → 2416,58 |
return dst_use_cpu(dst_pixmap); |
} |
static int |
reuse_source(struct sna *sna, |
PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y, |
PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y) |
{ |
if (src_x != msk_x || src_y != msk_y) |
return false; |
if (mask == src) { |
*mc = *sc; |
if (mc->bo) |
kgem_bo_reference(mc->bo); |
return true; |
} |
if ((src->pDrawable == NULL || mask->pDrawable != src->pDrawable)) |
return false; |
if (sc->is_solid) |
return false; |
DBG(("%s: mask reuses source drawable\n", __FUNCTION__)); |
if (!sna_transform_equal(src->transform, mask->transform)) |
return false; |
if (!sna_picture_alphamap_equal(src, mask)) |
return false; |
if (!gen3_check_repeat(mask)) |
return false; |
if (!gen3_check_filter(mask)) |
return false; |
if (!gen3_check_format(mask)) |
return false; |
DBG(("%s: reusing source channel for mask with a twist\n", |
__FUNCTION__)); |
*mc = *sc; |
mc->repeat = gen3_texture_repeat(mask->repeat ? mask->repeatType : RepeatNone); |
mc->filter = gen3_filter(mask->filter); |
mc->pict_format = mask->format; |
gen3_composite_channel_set_format(mc, mask->format); |
assert(mc->card_format); |
if (mc->bo) |
kgem_bo_reference(mc->bo); |
return true; |
} |
static bool |
gen3_render_composite(struct sna *sna, |
uint8_t op, |
2083,7 → 2492,6 |
* 3D -> 2D context switch. |
*/ |
if (mask == NULL && |
try_blt(sna, dst, src, width, height) && |
sna_blt_composite(sna, |
op, src, dst, |
src_x, src_y, |
2093,7 → 2501,7 |
return true; |
if (gen3_composite_fallback(sna, op, src, mask, dst)) |
return false; |
goto fallback; |
if (need_tiling(sna, width, height)) |
return sna_tiling_composite(op, src, mask, dst, |
2117,7 → 2525,7 |
if (!sna_render_composite_redirect(sna, tmp, |
dst_x, dst_y, width, height, |
op > PictOpSrc || dst->pCompositeClip->data)) |
return false; |
goto fallback; |
} |
tmp->u.gen3.num_constants = 0; |
2406,8 → 2814,8 |
goto cleanup_mask; |
} |
gen3_align_vertex(sna, tmp); |
gen3_emit_composite_state(sna, tmp); |
gen3_align_vertex(sna, tmp); |
return true; |
cleanup_mask: |
2419,221 → 2827,1720 |
cleanup_dst: |
if (tmp->redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp->dst.bo); |
return false; |
fallback: |
return (mask == NULL && |
sna_blt_composite(sna, |
op, src, dst, |
src_x, src_y, |
dst_x, dst_y, |
width, height, |
tmp, true)); |
} |
#endif |
static void |
gen3_emit_composite_spans_vertex(struct sna *sna, |
const struct sna_composite_spans_op *op, |
int16_t x, int16_t y, |
float opacity) |
{ |
gen3_emit_composite_dstcoord(sna, x + op->base.dst.x, y + op->base.dst.y); |
gen3_emit_composite_texcoord(sna, &op->base.src, x, y); |
OUT_VERTEX(opacity); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_zero(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 6; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
v[2] = op->base.dst.x + box->x1; |
v[3] = v[1]; |
v[4] = v[2]; |
v[5] = op->base.dst.y + box->y1; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_zero__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
v[2] = op->base.dst.x + b->box.x1; |
v[3] = v[1]; |
v[4] = v[2]; |
v[5] = op->base.dst.y + b->box.y1; |
v += 6; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 6; |
v[0] = box->x2; |
v[3] = v[1] = box->y2; |
v[4] = v[2] = box->x1; |
v[5] = box->y1; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_zero_no_offset__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, float *v) |
{ |
do { |
v[0] = b->box.x2; |
v[3] = v[1] = b->box.y2; |
v[4] = v[2] = b->box.x1; |
v[5] = b->box.y1; |
b++; |
v += 6; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_constant(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = op->base.dst.x + box->x2; |
v[6] = v[3] = op->base.dst.x + box->x1; |
v[4] = v[1] = op->base.dst.y + box->y2; |
v[7] = op->base.dst.y + box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_constant__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[6] = v[3] = op->base.dst.x + b->box.x1; |
v[4] = v[1] = op->base.dst.y + b->box.y2; |
v[7] = op->base.dst.y + b->box.y1; |
v[8] = v[5] = v[2] = b->alpha; |
v += 9; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = box->x2; |
v[6] = v[3] = box->x1; |
v[4] = v[1] = box->y2; |
v[7] = box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_constant_no_offset__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, float *v) |
{ |
do { |
v[0] = b->box.x2; |
v[6] = v[3] = b->box.x1; |
v[4] = v[1] = b->box.y2; |
v[7] = b->box.y1; |
v[8] = v[5] = v[2] = b->alpha; |
v += 9; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_identity_source(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0]; |
v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1]; |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0]; |
v[8] = v[3]; |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
v[12] = v[7]; |
v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1]; |
v[14] = opacity; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_identity_source__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0]; |
v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1]; |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0]; |
v[8] = v[3]; |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
v[12] = v[7]; |
v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1]; |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_affine_source(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
PictTransform *transform = op->base.src.transform; |
float *v; |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[6] = v[1] = op->base.dst.y + box->y2; |
v[10] = v[5] = op->base.dst.x + box->x1; |
v[11] = op->base.dst.y + box->y1; |
v[14] = v[9] = v[4] = opacity; |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2, |
(int)op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, |
(int)op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, |
(int)op->base.src.offset[1] + box->y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_affine_source__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
PictTransform *transform = op->base.src.transform; |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[6] = v[1] = op->base.dst.y + b->box.y2; |
v[10] = v[5] = op->base.dst.x + b->box.x1; |
v[11] = op->base.dst.y + b->box.y1; |
v[14] = v[9] = v[4] = b->alpha; |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2, |
(int)op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, |
(int)op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, |
(int)op->base.src.offset[1] + b->box.y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v += 15; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
v[2] = op->base.src.offset[0] + box->x2; |
v[3] = op->base.src.offset[1] + box->y2; |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
v[7] = op->base.src.offset[0] + box->x1; |
v[8] = v[3]; |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
v[12] = v[7]; |
v[13] = op->base.src.offset[1] + box->y1; |
v[14] = opacity; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_identity_gradient__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
v[2] = op->base.src.offset[0] + b->box.x2; |
v[3] = op->base.src.offset[1] + b->box.y2; |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
v[7] = op->base.src.offset[0] + b->box.x1; |
v[8] = v[3]; |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
v[12] = v[7]; |
v[13] = op->base.src.offset[1] + b->box.y1; |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
#if defined(sse2) && !defined(__x86_64__) |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_constant__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = op->base.dst.x + box->x2; |
v[6] = v[3] = op->base.dst.x + box->x1; |
v[4] = v[1] = op->base.dst.y + box->y2; |
v[7] = op->base.dst.y + box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_constant__sse2__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[6] = v[3] = op->base.dst.x + b->box.x1; |
v[4] = v[1] = op->base.dst.y + b->box.y2; |
v[7] = op->base.dst.y + b->box.y1; |
v[8] = v[5] = v[2] = b->alpha; |
v += 9; |
b++; |
} while (--nbox); |
} |
sse2 fastcall static void |
gen3_render_composite_spans_constant_box__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, float opacity) |
{ |
float *v; |
DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n", |
__FUNCTION__, |
op->base.src.offset[0], op->base.src.offset[1], |
opacity, |
op->base.dst.x, op->base.dst.y, |
box->x1, box->y1, |
box->x2 - box->x1, |
box->y2 - box->y1)); |
gen3_get_rectangles(sna, &op->base, 1); |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = box->x2; |
v[6] = v[3] = box->x1; |
v[4] = v[1] = box->y2; |
v[7] = box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
sse2 fastcall static void |
gen3_render_composite_spans_constant_thread__sse2__boxes(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *box, |
int nbox) |
{ |
DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n", |
__FUNCTION__, nbox, |
op->base.src.offset[0], op->base.src.offset[1], |
op->base.dst.x, op->base.dst.y)); |
sna_vertex_lock(&sna->render); |
do { |
int nbox_this_time; |
float *v; |
nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); |
assert(nbox_this_time); |
nbox -= nbox_this_time; |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += nbox_this_time * 9; |
sna_vertex_acquire__locked(&sna->render); |
sna_vertex_unlock(&sna->render); |
do { |
v[0] = box->box.x2; |
v[6] = v[3] = box->box.x1; |
v[4] = v[1] = box->box.y2; |
v[7] = box->box.y1; |
v[8] = v[5] = v[2] = box->alpha; |
v += 9; |
box++; |
} while (--nbox_this_time); |
sna_vertex_lock(&sna->render); |
sna_vertex_release__locked(&sna->render); |
} while (nbox); |
sna_vertex_unlock(&sna->render); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_constant__sse2__no_offset(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = box->x2; |
v[6] = v[3] = box->x1; |
v[4] = v[1] = box->y2; |
v[7] = box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, float *v) |
{ |
do { |
v[0] = b->box.x2; |
v[6] = v[3] = b->box.x1; |
v[4] = v[1] = b->box.y2; |
v[7] = b->box.y1; |
v[8] = v[5] = v[2] = b->alpha; |
v += 9; |
b++; |
} while (--nbox); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0]; |
v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1]; |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0]; |
v[8] = v[3]; |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
v[12] = v[7]; |
v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1]; |
v[14] = opacity; |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_identity_source__sse2__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0]; |
v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1]; |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0]; |
v[8] = v[3]; |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
v[12] = v[7]; |
v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1]; |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
PictTransform *transform = op->base.src.transform; |
float *v; |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[6] = v[1] = op->base.dst.y + box->y2; |
v[10] = v[5] = op->base.dst.x + box->x1; |
v[11] = op->base.dst.y + box->y1; |
v[14] = v[9] = v[4] = opacity; |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2, |
(int)op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, |
(int)op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, |
(int)op->base.src.offset[1] + box->y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_affine_source__sse2__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
PictTransform *transform = op->base.src.transform; |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[6] = v[1] = op->base.dst.y + b->box.y2; |
v[10] = v[5] = op->base.dst.x + b->box.x1; |
v[11] = op->base.dst.y + b->box.y1; |
v[14] = v[9] = v[4] = b->alpha; |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2, |
(int)op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, |
(int)op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, |
(int)op->base.src.offset[1] + b->box.y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v += 15; |
b++; |
} while (--nbox); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_identity_gradient__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
v[2] = op->base.src.offset[0] + box->x2; |
v[3] = op->base.src.offset[1] + box->y2; |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
v[7] = op->base.src.offset[0] + box->x1; |
v[8] = v[3]; |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
v[12] = v[7]; |
v[13] = op->base.src.offset[1] + box->y1; |
v[14] = opacity; |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
v[2] = op->base.src.offset[0] + b->box.x2; |
v[3] = op->base.src.offset[1] + b->box.y2; |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
v[7] = op->base.src.offset[0] + b->box.x1; |
v[8] = v[3]; |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
v[12] = v[7]; |
v[13] = op->base.src.offset[1] + b->box.y1; |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_affine_gradient__sse2(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
PictTransform *transform = op->base.src.transform; |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, |
op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, |
op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, |
op->base.src.offset[1] + box->y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v[14] = opacity; |
} |
sse2 fastcall static void |
gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
PictTransform *transform = op->base.src.transform; |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, |
op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, |
op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, |
op->base.src.offset[1] + b->box.y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
#endif |
fastcall static void |
gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
PictTransform *transform = op->base.src.transform; |
float *v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 15; |
v[0] = op->base.dst.x + box->x2; |
v[1] = op->base.dst.y + box->y2; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, |
op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
v[4] = opacity; |
v[5] = op->base.dst.x + box->x1; |
v[6] = v[1]; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, |
op->base.src.offset[1] + box->y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
v[9] = opacity; |
v[10] = v[5]; |
v[11] = op->base.dst.y + box->y1; |
_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, |
op->base.src.offset[1] + box->y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v[14] = opacity; |
} |
fastcall static void |
gen3_emit_composite_spans_primitive_affine_gradient__boxes(const struct sna_composite_spans_op *op, |
const struct sna_opacity_box *b, |
int nbox, |
float *v) |
{ |
PictTransform *transform = op->base.src.transform; |
do { |
v[0] = op->base.dst.x + b->box.x2; |
v[1] = op->base.dst.y + b->box.y2; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, |
op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[2], &v[3]); |
v[4] = b->alpha; |
v[5] = op->base.dst.x + b->box.x1; |
v[6] = v[1]; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, |
op->base.src.offset[1] + b->box.y2, |
transform, op->base.src.scale, |
&v[7], &v[8]); |
v[9] = b->alpha; |
v[10] = v[5]; |
v[11] = op->base.dst.y + b->box.y1; |
_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, |
op->base.src.offset[1] + b->box.y1, |
transform, op->base.src.scale, |
&v[12], &v[13]); |
v[14] = b->alpha; |
v += 15; |
b++; |
} while (--nbox); |
} |
fastcall static void |
gen3_emit_composite_spans_primitive(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, |
float opacity) |
{ |
gen3_emit_composite_spans_vertex(sna, op, |
box->x2, box->y2, |
opacity); |
gen3_emit_composite_spans_vertex(sna, op, |
box->x1, box->y2, |
opacity); |
gen3_emit_composite_spans_vertex(sna, op, |
box->x1, box->y1, |
opacity); |
} |
fastcall static void |
gen3_render_composite_spans_constant_box(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, float opacity) |
{ |
float *v; |
DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n", |
__FUNCTION__, |
op->base.src.offset[0], op->base.src.offset[1], |
opacity, |
op->base.dst.x, op->base.dst.y, |
box->x1, box->y1, |
box->x2 - box->x1, |
box->y2 - box->y1)); |
gen3_get_rectangles(sna, &op->base, 1); |
v = sna->render.vertices + sna->render.vertex_used; |
sna->render.vertex_used += 9; |
v[0] = box->x2; |
v[6] = v[3] = box->x1; |
v[4] = v[1] = box->y2; |
v[7] = box->y1; |
v[8] = v[5] = v[2] = opacity; |
} |
/* Thread-safe batched emission of constant-colour span boxes.
 * Reserves a run of rectangles while holding the vertex lock, then drops
 * the lock (keeping a reference via acquire/release) so the vertex data
 * can be written concurrently with other threads reserving their own
 * space. 9 floats per box: 3 vertices x {x, y, alpha}.
 */
fastcall static void
gen3_render_composite_spans_constant_thread_boxes(struct sna *sna,
						  const struct sna_composite_spans_op *op,
						  const struct sna_opacity_box *box,
						  int nbox)
{
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		/* Reserve as many rectangles as fit in the current batch. */
		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * 9;

		/* Pin the vertex buffer, then release the lock while we
		 * fill in our reserved span.
		 */
		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		do {
			v[0] = box->box.x2;
			v[6] = v[3] = box->box.x1;
			v[4] = v[1] = box->box.y2;
			v[7] = box->box.y1;
			v[8] = v[5] = v[2] = box->alpha;
			v += 9;
			box++;
		} while (--nbox_this_time);

		/* Re-take the lock before dropping our buffer reference. */
		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
/* Emit a single span box through the operation's primitive emitter:
 * reserve one rectangle's worth of vertices, then let prim_emit write
 * the vertex data appropriate for the shader in use.
 */
fastcall static void
gen3_render_composite_spans_box(struct sna *sna,
				const struct sna_composite_spans_op *op,
				const BoxRec *box, float opacity)
{
	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
	     __FUNCTION__,
	     op->base.src.offset[0], op->base.src.offset[1],
	     opacity,
	     op->base.dst.x, op->base.dst.y,
	     box->x1, box->y1,
	     box->x2 - box->x1,
	     box->y2 - box->y1));

	gen3_get_rectangles(sna, &op->base, 1);
	op->prim_emit(sna, op, box, opacity);
}
static void |
gen3_render_composite_spans_boxes(struct sna *sna, |
const struct sna_composite_spans_op *op, |
const BoxRec *box, int nbox, |
float opacity) |
{ |
DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n", |
__FUNCTION__, nbox, |
op->base.src.offset[0], op->base.src.offset[1], |
opacity, |
op->base.dst.x, op->base.dst.y)); |
do { |
int nbox_this_time; |
nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); |
nbox -= nbox_this_time; |
do { |
DBG((" %s: (%d, %d) x (%d, %d)\n", __FUNCTION__, |
box->x1, box->y1, |
box->x2 - box->x1, |
box->y2 - box->y1)); |
op->prim_emit(sna, op, box++, opacity); |
} while (--nbox_this_time); |
} while (nbox); |
} |
/* Thread-safe batched span emission via the operation's emit_boxes hook.
 * Same locking protocol as the constant thread path: reserve under the
 * vertex lock, acquire a buffer reference, drop the lock while writing,
 * then re-lock to release the reference.
 */
fastcall static void
gen3_render_composite_spans_boxes__thread(struct sna *sna,
					  const struct sna_composite_spans_op *op,
					  const struct sna_opacity_box *box,
					  int nbox)
{
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		/* Reserve as many rectangles as fit in the current batch. */
		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		/* Shader-specific vertex layout is handled by emit_boxes. */
		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
/* Finish a spans operation: flush any pending vertices, drop the source
 * bo reference and undo any render-target redirection.
 */
fastcall static void
gen3_render_composite_spans_done(struct sna *sna,
				 const struct sna_composite_spans_op *op)
{
	/* A non-zero vertex_offset means a primitive is still open. */
	if (sna->render.vertex_offset)
		gen3_vertex_flush(sna);

	DBG(("%s()\n", __FUNCTION__));

	if (op->base.src.bo)
		kgem_bo_destroy(&sna->kgem, op->base.src.bo);

	sna_render_composite_redirect_done(sna, &op->base);
}
static bool |
gen3_check_composite_spans(struct sna *sna, |
uint8_t op, PicturePtr src, PicturePtr dst, |
int16_t width, int16_t height, unsigned flags) |
{ |
if (op >= ARRAY_SIZE(gen3_blend_op)) |
return false; |
if (gen3_composite_fallback(sna, op, src, NULL, dst)) |
return false; |
if (need_tiling(sna, width, height) && |
!is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) { |
DBG(("%s: fallback, tiled operation not on GPU\n", |
__FUNCTION__)); |
return false; |
} |
return true; |
} |
/* Prepare a gen3 spans-compositing operation.
 * Sets up the render target (redirecting if it is too large for the 3D
 * pipeline), resolves the source into a gen3 shader type, and selects the
 * per-box vertex emitters — preferring SSE2 variants when available.
 * Returns true on success with *tmp filled in; false on fallback.
 */
static bool
gen3_render_composite_spans(struct sna *sna,
			    uint8_t op,
			    PicturePtr src,
			    PicturePtr dst,
			    int16_t src_x, int16_t src_y,
			    int16_t dst_x, int16_t dst_y,
			    int16_t width, int16_t height,
			    unsigned flags,
			    struct sna_composite_spans_op *tmp)
{
	bool no_offset;

	DBG(("%s(src=(%d, %d), dst=(%d, %d), size=(%d, %d))\n", __FUNCTION__,
	     src_x, src_y, dst_x, dst_y, width, height));

	assert(gen3_check_composite_spans(sna, op, src, dst, width, height, flags));

	/* Too wide for one pass: split into tiles handled recursively. */
	if (need_tiling(sna, width, height)) {
		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
		     __FUNCTION__, width, height));
		return sna_tiling_composite_spans(op, src, dst,
						  src_x, src_y, dst_x, dst_y,
						  width, height, flags, tmp);
	}

	if (!gen3_composite_set_target(sna, &tmp->base, dst,
				       dst_x, dst_y, width, height)) {
		DBG(("%s: unable to set render target\n",
		     __FUNCTION__));
		return false;
	}

	tmp->base.op = op;
	tmp->base.rb_reversed = gen3_dst_rb_reversed(tmp->base.dst.format);
	/* Targets beyond the hw limits are redirected to a proxy bo. */
	if (too_large(tmp->base.dst.width, tmp->base.dst.height) ||
	    !gen3_check_pitch_3d(tmp->base.dst.bo)) {
		if (!sna_render_composite_redirect(sna, &tmp->base,
						   dst_x, dst_y, width, height,
						   true))
			return false;
	}

	tmp->base.src.u.gen3.type = SHADER_TEXTURE;
	tmp->base.src.is_affine = true;
	DBG(("%s: preparing source\n", __FUNCTION__));
	/* -1: hard failure; 0: source reduces to nothing; 1: usable channel. */
	switch (gen3_composite_picture(sna, src, &tmp->base, &tmp->base.src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		tmp->base.src.u.gen3.type = SHADER_ZERO;
		break;
	case 1:
		gen3_composite_channel_convert(&tmp->base.src);
		break;
	}
	DBG(("%s: source type=%d\n", __FUNCTION__, tmp->base.src.u.gen3.type));

	/* The span opacity is fed through the mask channel. */
	if (tmp->base.src.u.gen3.type != SHADER_ZERO)
		tmp->base.mask.u.gen3.type = SHADER_OPACITY;

	no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0;
	tmp->box   = gen3_render_composite_spans_box;
	tmp->boxes = gen3_render_composite_spans_boxes;
	tmp->thread_boxes = gen3_render_composite_spans_boxes__thread;
	tmp->done  = gen3_render_composite_spans_done;
	tmp->prim_emit = gen3_emit_composite_spans_primitive;
	/* Pick the specialised emitters for the resolved shader type. */
	switch (tmp->base.src.u.gen3.type) {
	case SHADER_NONE:
		assert(0);
	case SHADER_ZERO:
		if (no_offset) {
			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero_no_offset;
			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero_no_offset__boxes;
		} else {
			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero;
			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero__boxes;
		}
		break;
	case SHADER_BLACK:
	case SHADER_WHITE:
	case SHADER_CONSTANT:
		if (no_offset) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->box = gen3_render_composite_spans_constant_box__sse2;
				tmp->thread_boxes = gen3_render_composite_spans_constant_thread__sse2__boxes;
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2__no_offset;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes;
			} else
#endif
			{
				tmp->box = gen3_render_composite_spans_constant_box;
				tmp->thread_boxes = gen3_render_composite_spans_constant_thread_boxes;
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant_no_offset__boxes;
			}
		} else {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__boxes;
			}
		}
		break;
	case SHADER_LINEAR:
	case SHADER_RADIAL:
		if (tmp->base.src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__boxes;
			}
		} else if (tmp->base.src.is_affine) {
			/* Fold the homogeneous divisor into the scale. */
			tmp->base.src.scale[1] = tmp->base.src.scale[0] = 1. / tmp->base.src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__boxes;
			}
		}
		break;
	case SHADER_TEXTURE:
		if (tmp->base.src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__boxes;
			}
		} else if (tmp->base.src.is_affine) {
			/* Fold the homogeneous divisor into the scale. */
			tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
			tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__boxes;
			}
		}
		break;
	}
	/* The threaded path needs a batched emitter; disable it otherwise. */
	if (tmp->emit_boxes == NULL)
		tmp->thread_boxes = NULL;

	tmp->base.mask.bo = NULL;

	/* 2 floats for position, plus texcoords unless the shader is a
	 * constant, plus 1 for the opacity channel.
	 */
	tmp->base.floats_per_vertex = 2;
	if (!is_constant_ps(tmp->base.src.u.gen3.type))
		tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3;
	tmp->base.floats_per_vertex +=
		tmp->base.mask.u.gen3.type == SHADER_OPACITY;
	tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;

	/* Make sure both bos fit in this batch, flushing once if needed. */
	if (!kgem_check_bo(&sna->kgem,
			   tmp->base.dst.bo, tmp->base.src.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->base.dst.bo, tmp->base.src.bo,
				   NULL))
			goto cleanup_src;
	}

	gen3_align_vertex(sna, &tmp->base);
	gen3_emit_composite_state(sna, &tmp->base);
	return true;

cleanup_src:
	if (tmp->base.src.bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
cleanup_dst:
	if (tmp->base.redirect.real_bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
	return false;
}
/* Emit the gen3 3D state for an overlay-free (textured) video blit.
 * Packed YUV (YUY2/UYVY) uses the hardware 422 sampler with colorspace
 * conversion; planar formats use three Y8 samplers and a hand-written
 * YUV->RGB pixel shader. Redundant constant/sampler packets are rewound
 * when they match what is already in the batch.
 */
static void
gen3_emit_video_state(struct sna *sna,
		      struct sna_video *video,
		      struct sna_video_frame *frame,
		      PixmapPtr pixmap,
		      struct kgem_bo *dst_bo,
		      int width, int height,
		      bool bilinear)
{
	struct gen3_render_state *state = &sna->render_state.gen3;
	uint32_t id, ms3, rewind;

	gen3_emit_target(sna, dst_bo, width, height,
			 sna_format_for_depth(pixmap->drawable.depth));

	/* XXX share with composite? Is it worth the effort? */
	/* Bit 31 of last_shader marks "video state already loaded". */
	if ((state->last_shader & (1<<31)) == 0) {
		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
			  I1_LOAD_S(1) | I1_LOAD_S(2) | I1_LOAD_S(6) |
			  2);
		OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) | (4 << S1_VERTEX_PITCH_SHIFT));
		OUT_BATCH(S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D) |
			  S2_TEXCOORD_FMT(1, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(2, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(3, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(4, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(5, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(6, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(7, TEXCOORDFMT_NOT_PRESENT));
		OUT_BATCH((2 << S6_CBUF_SRC_BLEND_FACT_SHIFT) |
			  (1 << S6_CBUF_DST_BLEND_FACT_SHIFT) |
			  S6_COLOR_WRITE_ENABLE);

		state->last_blend = 0;
		state->floats_per_vertex = 4;
	}

	if (!is_planar_fourcc(frame->id)) {
		/* Packed YUY2/UYVY path. */
		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | 4);
		OUT_BATCH(0x0000001);	/* constant 0 */
		/* constant 0: brightness/contrast */
		OUT_BATCH_F(video->brightness / 128.0);
		OUT_BATCH_F(video->contrast / 255.0);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		/* Skip the packet if it matches the previously-emitted one. */
		if (state->last_constants &&
		    memcmp(&sna->kgem.batch[state->last_constants],
			   &sna->kgem.batch[rewind],
			   6*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_constants = rewind;

		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_SAMPLER_STATE | 3);
		OUT_BATCH(0x00000001);
		OUT_BATCH(SS2_COLORSPACE_CONVERSION |
			  (FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (0 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		if (state->last_sampler &&
		    memcmp(&sna->kgem.batch[state->last_sampler],
			   &sna->kgem.batch[rewind],
			   5*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_sampler = rewind;

		OUT_BATCH(_3DSTATE_MAP_STATE | 3);
		OUT_BATCH(0x00000001);	/* texture map #1 */
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 0));
		ms3 = MAPSURF_422;
		switch (frame->id) {
		case FOURCC_YUY2:
			ms3 |= MT_422_YCRCB_NORMAL;
			break;
		case FOURCC_UYVY:
			ms3 |= MT_422_YCRCB_SWAPY;
			break;
		}
		ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		/* Shader id: video marker | packed-path tag | brightness use. */
		id = 1<<31 | 1<<1 | !!video->brightness;
		if (state->last_shader != id) {
			state->last_shader = id;
			/* Reserve a dword; the program length is patched in
			 * once the instructions have been emitted.
			 */
			id = sna->kgem.nbatch++;
			gen3_fs_dcl(FS_S0);
			gen3_fs_dcl(FS_T0);
			gen3_fs_texld(FS_OC, FS_S0, FS_T0);
			if (video->brightness != 0) {
				gen3_fs_add(FS_OC,
					    gen3_fs_operand_reg(FS_OC),
					    gen3_fs_operand(FS_C0, X, X, X, ZERO));
			}
			sna->kgem.batch[id] =
				_3DSTATE_PIXEL_SHADER_PROGRAM |
				(sna->kgem.nbatch - id - 2);
		}
	} else {
		/* For the planar formats, we set up three samplers --
		 * one for each plane, in a Y8 format.  Because I
		 * couldn't get the special PLANAR_TO_PACKED
		 * shader setup to work, I did the manual pixel shader:
		 *
		 * y' = y - .0625
		 * u' = u - .5
		 * v' = v - .5;
		 *
		 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
		 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
		 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
		 *
		 * register assignment:
		 * r0 = (y',u',v',0)
		 * r1 = (y,y,y,y)
		 * r2 = (u,u,u,u)
		 * r3 = (v,v,v,v)
		 * OC = (r,g,b,1)
		 */
		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | (22 - 2));
		OUT_BATCH(0x000001f);	/* constants 0-4 */
		/* constant 0: normalization offsets */
		OUT_BATCH_F(-0.0625);
		OUT_BATCH_F(-0.5);
		OUT_BATCH_F(-0.5);
		OUT_BATCH_F(0.0);
		/* constant 1: r coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(1.5958);
		OUT_BATCH_F(0.0);
		/* constant 2: g coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(-0.39173);
		OUT_BATCH_F(-0.81290);
		OUT_BATCH_F(0.0);
		/* constant 3: b coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(2.017);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		/* constant 4: brightness/contrast */
		OUT_BATCH_F(video->brightness / 128.0);
		OUT_BATCH_F(video->contrast / 255.0);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		if (state->last_constants &&
		    memcmp(&sna->kgem.batch[state->last_constants],
			   &sna->kgem.batch[rewind],
			   22*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_constants = rewind;

		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_SAMPLER_STATE | 9);
		OUT_BATCH(0x00000007);
		/* sampler 0 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (0 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* sampler 1 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (1 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* sampler 2 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (2 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		if (state->last_sampler &&
		    memcmp(&sna->kgem.batch[state->last_sampler],
			   &sna->kgem.batch[rewind],
			   11*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_sampler = rewind;

		/* Three maps: Y plane, then the half-resolution U and V. */
		OUT_BATCH(_3DSTATE_MAP_STATE | 9);
		OUT_BATCH(0x00000007);

		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 0));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		/* check to see if Y has special pitch than normal
		 * double u/v pitch, e.g i915 XvMC hw requires at
		 * least 1K alignment, so Y pitch might
		 * be same as U/V's.*/
		if (frame->pitch[1])
			OUT_BATCH(((frame->pitch[1] / 4) - 1) << MS4_PITCH_SHIFT);
		else
			OUT_BATCH(((frame->pitch[0] * 2 / 4) - 1) << MS4_PITCH_SHIFT);

		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 frame->UBufOffset));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height / 2 - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 frame->VBufOffset));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height / 2 - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		/* Shader id: video marker | planar-path tag | brightness use. */
		id = 1<<31 | 2<<1 | !!video->brightness;
		if (state->last_shader != id) {
			state->last_shader = id;
			/* Reserve a dword; length patched in afterwards. */
			id = sna->kgem.nbatch++;

			/* Declare samplers */
			gen3_fs_dcl(FS_S0);	/* Y */
			gen3_fs_dcl(FS_S1);	/* U */
			gen3_fs_dcl(FS_S2);	/* V */
			gen3_fs_dcl(FS_T0);	/* normalized coords */

			/* Load samplers to temporaries. */
			gen3_fs_texld(FS_R1, FS_S0, FS_T0);
			gen3_fs_texld(FS_R2, FS_S1, FS_T0);
			gen3_fs_texld(FS_R3, FS_S2, FS_T0);

			/* Move the sampled YUV data in R[123] to the first
			 * 3 channels of R0.
			 */
			gen3_fs_mov_masked(FS_R0, MASK_X,
					   gen3_fs_operand_reg(FS_R1));
			gen3_fs_mov_masked(FS_R0, MASK_Y,
					   gen3_fs_operand_reg(FS_R2));
			gen3_fs_mov_masked(FS_R0, MASK_Z,
					   gen3_fs_operand_reg(FS_R3));

			/* Normalize the YUV data */
			gen3_fs_add(FS_R0, gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C0));
			/* dot-product the YUV data in R0 by the vectors of
			 * coefficients for calculating R, G, and B, storing
			 * the results in the R, G, or B channels of the output
			 * color.  The OC results are implicitly clamped
			 * at the end of the program.
			 */
			gen3_fs_dp3(FS_OC, MASK_X,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C1));
			gen3_fs_dp3(FS_OC, MASK_Y,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C2));
			gen3_fs_dp3(FS_OC, MASK_Z,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C3));
			/* Set alpha of the output to 1.0, by wiring W to 1
			 * and not actually using the source.
			 */
			gen3_fs_mov_masked(FS_OC, MASK_W,
					   gen3_fs_operand_one());

			if (video->brightness != 0) {
				gen3_fs_add(FS_OC,
					    gen3_fs_operand_reg(FS_OC),
					    gen3_fs_operand(FS_C4, X, X, X, ZERO));
			}

			sna->kgem.batch[id] =
				_3DSTATE_PIXEL_SHADER_PROGRAM |
				(sna->kgem.nbatch - id - 2);
		}
	}
}
static void |
gen3_video_get_batch(struct sna *sna, struct kgem_bo *bo) |
{ |
kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); |
if (!kgem_check_batch(&sna->kgem, 120) || |
!kgem_check_reloc(&sna->kgem, 4) || |
!kgem_check_exec(&sna->kgem, 2)) { |
_kgem_submit(&sna->kgem); |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
if (sna->render_state.gen3.need_invariant) |
gen3_emit_invariant(sna); |
} |
static int
gen3_get_inline_rectangles(struct sna *sna, int want, int floats_per_vertex)
{
	/* Clamp 'want' to the number of inline rectangles (3 vertices each)
	 * that still fit in the batch, keeping one dword spare for the
	 * PRIM3D command itself.
	 */
	const int per_rect = 3 * floats_per_vertex;
	const int avail = batch_space(sna) - 1;

	if (want * per_rect > avail)
		want = avail / per_rect;

	return want;
}
static bool |
gen3_render_video(struct sna *sna, |
struct sna_video *video, |
struct sna_video_frame *frame, |
RegionPtr dstRegion, |
PixmapPtr pixmap) |
{ |
struct sna_pixmap *priv = sna_pixmap(pixmap); |
BoxPtr pbox = REGION_RECTS(dstRegion); |
int nbox = REGION_NUM_RECTS(dstRegion); |
int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1; |
int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1; |
int src_width = frame->src.x2 - frame->src.x1; |
int src_height = frame->src.y2 - frame->src.y1; |
float src_offset_x, src_offset_y; |
float src_scale_x, src_scale_y; |
int pix_xoff, pix_yoff; |
struct kgem_bo *dst_bo; |
bool bilinear; |
int copy = 0; |
DBG(("%s: src:%dx%d (frame:%dx%d) -> dst:%dx%d\n", __FUNCTION__, |
src_width, src_height, frame->width, frame->height, dst_width, dst_height)); |
dst_bo = priv->gpu_bo; |
if (dst_bo == NULL) |
return false; |
bilinear = src_width != dst_width || src_height != dst_height; |
src_scale_x = (float)src_width / dst_width / frame->width; |
src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x; |
src_scale_y = (float)src_height / dst_height / frame->height; |
src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y; |
DBG(("%s: src offset (%f, %f), scale (%f, %f)\n", |
__FUNCTION__, src_offset_x, src_offset_y, src_scale_x, src_scale_y)); |
if (too_large(pixmap->drawable.width, pixmap->drawable.height) || |
!gen3_check_pitch_3d(dst_bo)) { |
int bpp = pixmap->drawable.bitsPerPixel; |
if (too_large(dst_width, dst_height)) |
return false; |
dst_bo = kgem_create_2d(&sna->kgem, |
dst_width, dst_height, bpp, |
kgem_choose_tiling(&sna->kgem, |
I915_TILING_X, |
dst_width, dst_height, bpp), |
0); |
if (!dst_bo) |
return false; |
pix_xoff = -dstRegion->extents.x1; |
pix_yoff = -dstRegion->extents.y1; |
copy = 1; |
} else { |
/* Set up the offset for translating from the given region |
* (in screen coordinates) to the backing pixmap. |
*/ |
#ifdef COMPOSITE |
pix_xoff = -pixmap->screen_x + pixmap->drawable.x; |
pix_yoff = -pixmap->screen_y + pixmap->drawable.y; |
#else |
pix_xoff = 0; |
pix_yoff = 0; |
#endif |
dst_width = pixmap->drawable.width; |
dst_height = pixmap->drawable.height; |
} |
gen3_video_get_batch(sna, dst_bo); |
gen3_emit_video_state(sna, video, frame, pixmap, |
dst_bo, dst_width, dst_height, bilinear); |
do { |
int nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4); |
if (nbox_this_time == 0) { |
gen3_video_get_batch(sna, dst_bo); |
gen3_emit_video_state(sna, video, frame, pixmap, |
dst_bo, dst_width, dst_height, bilinear); |
nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4); |
assert(nbox_this_time); |
} |
nbox -= nbox_this_time; |
OUT_BATCH(PRIM3D_RECTLIST | (12 * nbox_this_time - 1)); |
do { |
int box_x1 = pbox->x1; |
int box_y1 = pbox->y1; |
int box_x2 = pbox->x2; |
int box_y2 = pbox->y2; |
pbox++; |
DBG(("%s: dst (%d, %d), (%d, %d) + (%d, %d); src (%f, %f), (%f, %f)\n", |
__FUNCTION__, box_x1, box_y1, box_x2, box_y2, pix_xoff, pix_yoff, |
box_x1 * src_scale_x + src_offset_x, |
box_y1 * src_scale_y + src_offset_y, |
box_x2 * src_scale_x + src_offset_x, |
box_y2 * src_scale_y + src_offset_y)); |
/* bottom right */ |
OUT_BATCH_F(box_x2 + pix_xoff); |
OUT_BATCH_F(box_y2 + pix_yoff); |
OUT_BATCH_F(box_x2 * src_scale_x + src_offset_x); |
OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y); |
/* bottom left */ |
OUT_BATCH_F(box_x1 + pix_xoff); |
OUT_BATCH_F(box_y2 + pix_yoff); |
OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x); |
OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y); |
/* top left */ |
OUT_BATCH_F(box_x1 + pix_xoff); |
OUT_BATCH_F(box_y1 + pix_yoff); |
OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x); |
OUT_BATCH_F(box_y1 * src_scale_y + src_offset_y); |
} while (--nbox_this_time); |
} while (nbox); |
if (copy) { |
#ifdef COMPOSITE |
pix_xoff = -pixmap->screen_x + pixmap->drawable.x; |
pix_yoff = -pixmap->screen_y + pixmap->drawable.y; |
#else |
pix_xoff = 0; |
pix_yoff = 0; |
#endif |
sna_blt_copy_boxes(sna, GXcopy, |
dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1, |
priv->gpu_bo, pix_xoff, pix_yoff, |
pixmap->drawable.bitsPerPixel, |
REGION_RECTS(dstRegion), |
REGION_NUM_RECTS(dstRegion)); |
kgem_bo_destroy(&sna->kgem, dst_bo); |
} |
if (!DAMAGE_IS_ALL(priv->gpu_damage)) { |
if ((pix_xoff | pix_yoff) == 0) { |
sna_damage_add(&priv->gpu_damage, dstRegion); |
sna_damage_subtract(&priv->cpu_damage, dstRegion); |
} else { |
sna_damage_add_boxes(&priv->gpu_damage, |
REGION_RECTS(dstRegion), |
REGION_NUM_RECTS(dstRegion), |
pix_xoff, pix_yoff); |
sna_damage_subtract_boxes(&priv->cpu_damage, |
REGION_RECTS(dstRegion), |
REGION_NUM_RECTS(dstRegion), |
pix_xoff, pix_yoff); |
} |
} |
return true; |
} |
#endif |
2658,6 → 4565,140 |
#if 0 |
/* Fill the axis-aligned rectangle (x1,y1)-(x2,y2) of dst/bo with a solid
 * colour via the gen3 RENDER pipeline, routing to the BLT engine whenever
 * the 3D pipe cannot handle the target (size, pitch, alu) or is a worse
 * choice.  Returns true if the fill was queued, false if the caller must
 * fall back to software.
 */
static bool
gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
		     uint32_t color,
		     int16_t x1, int16_t y1,
		     int16_t x2, int16_t y2,
		     uint8_t alu)
{
	struct sna_composite_op tmp;

#if NO_FILL_ONE
	/* Compile-time opt-out: always take the BLT path. */
	return gen3_render_fill_one_try_blt(sna, dst, bo, color,
					    x1, y1, x2, y2, alu);
#endif

	/* Prefer to use the BLT if already engaged */
	if (prefer_fill_blt(sna) &&
	    gen3_render_fill_one_try_blt(sna, dst, bo, color,
					 x1, y1, x2, y2, alu))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height) ||
	    bo->pitch > MAX_3D_PITCH)
		return gen3_render_fill_one_try_blt(sna, dst, bo, color,
						    x1, y1, x2, y2, alu);

	if (alu == GXclear)
		color = 0;

	/* Describe the fill as a minimal composite op: solid source, no
	 * mask, 2 floats (x,y) per vertex and 3 vertices per rectangle.
	 */
	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
	tmp.dst.pixmap = dst;
	tmp.dst.width = dst->drawable.width;
	tmp.dst.height = dst->drawable.height;
	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp.dst.bo = bo;
	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.need_magic_ca_pass = 0;
	tmp.has_component_alpha = 0;
	tmp.rb_reversed = 0;

	gen3_init_solid(&tmp.src,
			sna_rgba_for_color(color, dst->drawable.depth));
	tmp.mask.bo = NULL;
	tmp.mask.u.gen3.type = SHADER_NONE;
	tmp.u.gen3.num_constants = 0;

	/* Ensure the target fits in the aperture; on failure flush the
	 * batch, retry via the BLT once more, and only then give up.
	 */
	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
		kgem_submit(&sna->kgem);

		if (gen3_render_fill_one_try_blt(sna, dst, bo, color,
						 x1, y1, x2, y2, alu))
			return true;

		if (!kgem_check_bo(&sna->kgem, bo, NULL))
			return false;
	}

	gen3_align_vertex(sna, &tmp);
	gen3_emit_composite_state(sna, &tmp);
	gen3_get_rectangles(sna, &tmp, 1);
	DBG((" (%d, %d), (%d, %d): %x\n", x1, y1, x2, y2, color));
	/* Emit the three corners of the rectangle (bottom-right,
	 * bottom-left, top-left) expected by the rectangle primitive. */
	OUT_VERTEX(x2);
	OUT_VERTEX(y2);
	OUT_VERTEX(x1);
	OUT_VERTEX(y2);
	OUT_VERTEX(x1);
	OUT_VERTEX(y1);
	gen3_vertex_flush(sna);

	return true;
}
#endif |
static void gen3_render_flush(struct sna *sna) |
{ |
gen3_vertex_close(sna); |
2808,7 → 4849,7 |
kgem_submit(&sna->kgem); |
} |
gen3_align_vertex(sna, tmp); |
gen3_emit_composite_state(sna, tmp); |
gen3_align_vertex(sna, tmp); |
return true; |
} |
/contrib/sdk/sources/Intel-2D/sna/gen4_common.c |
---|
0,0 → 1,64 |
/* |
* Copyright © 2011-2013 Intel Corporation |
* |
* Permission is hereby granted, free of charge, to any person obtaining a |
* copy of this software and associated documentation files (the "Software"), |
* to deal in the Software without restriction, including without limitation |
* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
* and/or sell copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following conditions: |
* |
* The above copyright notice and this permission notice (including the next |
* paragraph) shall be included in all copies or substantial portions of the |
* Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
* SOFTWARE. |
* |
* Authors: |
* Chris Wilson <chris@chris-wilson.co.uk> |
* |
*/ |
#ifdef HAVE_CONFIG_H |
#include "config.h" |
#endif |
#include "gen4_common.h" |
#include "gen4_vertex.h" |
void gen4_render_flush(struct sna *sna) |
{ |
gen4_vertex_close(sna); |
assert(sna->render.vb_id == 0); |
assert(sna->render.vertex_offset == 0); |
} |
void gen4_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.nvertex_reloc == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo\n", __FUNCTION__)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
void gen4_render_expire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.vbo && !sna->render.vertex_used) { |
DBG(("%s: discarding vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
} |
/contrib/sdk/sources/Intel-2D/sna/gen4_common.h |
---|
0,0 → 1,49 |
/* |
* Copyright © 2011-2013 Intel Corporation |
* |
* Permission is hereby granted, free of charge, to any person obtaining a |
* copy of this software and associated documentation files (the "Software"), |
* to deal in the Software without restriction, including without limitation |
* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
* and/or sell copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following conditions: |
* |
* The above copyright notice and this permission notice (including the next |
* paragraph) shall be included in all copies or substantial portions of the |
* Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
* SOFTWARE. |
* |
* Authors: |
* Chris Wilson <chris@chris-wilson.co.uk> |
* |
*/ |
#ifndef GEN4_COMMON_H |
#define GEN4_COMMON_H |
#include "sna.h" |
inline static void |
discard_vbo(struct sna *sna) |
{ |
kgem_bo_destroy(&sna->kgem, sna->render.vbo); |
sna->render.vbo = NULL; |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
void gen4_render_flush(struct sna *sna); |
void gen4_render_retire(struct kgem *kgem); |
void gen4_render_expire(struct kgem *kgem); |
#endif /* GEN4_COMMON_H */ |
/contrib/sdk/sources/Intel-2D/sna/gen4_render.c |
---|
41,6 → 41,7 |
//#include "sna_video.h" |
#include "brw/brw.h" |
#include "gen4_common.h" |
#include "gen4_render.h" |
#include "gen4_source.h" |
#include "gen4_vertex.h" |
549,9 → 550,6 |
if (!kgem_check_reloc_and_exec(&sna->kgem, 2)) |
return 0; |
if (op->need_magic_ca_pass && sna->render.vbo) |
return 0; |
if (sna->render.vertex_offset) { |
gen4_vertex_flush(sna); |
if (gen4_magic_ca_pass(sna, op)) |
747,16 → 745,10 |
{ |
assert(op->floats_per_rect == 3*op->floats_per_vertex); |
if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) { |
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) |
gen4_vertex_finish(sna); |
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", |
DBG(("aligning vertex: was %d, now %d floats per vertex\n", |
sna->render_state.gen4.floats_per_vertex, |
op->floats_per_vertex, |
sna->render.vertex_index, |
(sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); |
sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; |
op->floats_per_vertex)); |
gen4_vertex_align(sna, op); |
sna->render_state.gen4.floats_per_vertex = op->floats_per_vertex; |
} |
} |
1314,11 → 1306,12 |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) |
return false; |
} |
gen4_align_vertex(sna, &tmp); |
gen4_video_bind_surfaces(sna, &tmp); |
gen4_align_vertex(sna, &tmp); |
/* Set up the offset for translating from the given region (in screen |
* coordinates) to the backing pixmap. |
1549,33 → 1542,6 |
} |
static bool |
try_blt(struct sna *sna, |
PicturePtr dst, PicturePtr src, |
int width, int height) |
{ |
if (sna->kgem.mode != KGEM_RENDER) { |
DBG(("%s: already performing BLT\n", __FUNCTION__)); |
return true; |
} |
if (too_large(width, height)) { |
DBG(("%s: operation too large for 3D pipe (%d, %d)\n", |
__FUNCTION__, width, height)); |
return true; |
} |
if (too_large(dst->pDrawable->width, dst->pDrawable->height)) |
return true; |
/* The blitter is much faster for solids */ |
if (sna_picture_is_solid(src, NULL)) |
return true; |
/* is the source picture only in cpu memory e.g. a shm pixmap? */ |
return picture_is_cpu(sna, src); |
} |
static bool |
check_gradient(PicturePtr picture, bool precise) |
{ |
switch (picture->pSourcePict->type) { |
1803,7 → 1769,6 |
return false; |
if (mask == NULL && |
try_blt(sna, dst, src, width, height) && |
sna_blt_composite(sna, op, |
src, dst, |
src_x, src_y, |
1932,8 → 1897,8 |
goto cleanup_mask; |
} |
gen4_align_vertex(sna, tmp); |
gen4_bind_surfaces(sna, tmp); |
gen4_align_vertex(sna, tmp); |
return true; |
cleanup_mask: |
1990,51 → 1955,6 |
static void |
gen4_render_flush(struct sna *sna) |
{ |
gen4_vertex_close(sna); |
assert(sna->render.vb_id == 0); |
assert(sna->render.vertex_offset == 0); |
} |
static void |
discard_vbo(struct sna *sna) |
{ |
kgem_bo_destroy(&sna->kgem, sna->render.vbo); |
sna->render.vbo = NULL; |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
static void |
gen4_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo\n", __FUNCTION__)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void |
gen4_render_expire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.vbo && !sna->render.vertex_used) { |
DBG(("%s: discarding vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
} |
static void gen4_render_reset(struct sna *sna) |
{ |
sna->render_state.gen4.needs_invariant = true; |
2047,8 → 1967,7 |
sna->render_state.gen4.drawrect_limit = -1; |
sna->render_state.gen4.surface_table = -1; |
if (sna->render.vbo && |
!kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { |
if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { |
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
2407,8 → 2326,8 |
kgem_submit(&sna->kgem); |
} |
gen4_align_vertex(sna, tmp); |
gen4_bind_surfaces(sna, tmp); |
gen4_align_vertex(sna, tmp); |
return true; |
} |
/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.c |
---|
38,6 → 38,29 |
#define sse2 |
#endif |
/* Round the vertex-buffer write position up to a whole-vertex boundary
 * for op->floats_per_vertex and ensure there is room for at least two
 * more rectangles, finishing the vbo (and, if need be, the batch) first.
 * Called when the vertex layout for the operation changes.
 */
void gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op)
{
	int vertex_index;

	/* Three vertices per rectangle. */
	assert(op->floats_per_rect == 3*op->floats_per_vertex);

	/* First index that is a multiple of the new vertex size. */
	vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
	if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) {
		DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n",
		     __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex));

		/* Not enough space left: finish the current vbo; if even
		 * the fresh buffer is too small for one rectangle, submit
		 * the batch and restart in RENDER mode.
		 */
		if (gen4_vertex_finish(sna) < op->floats_per_rect) {
			kgem_submit(&sna->kgem);
			_kgem_set_mode(&sna->kgem, KGEM_RENDER);
		}

		/* Recompute against the (possibly reset) write position. */
		vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
		assert(vertex_index * op->floats_per_vertex <= sna->render.vertex_size);
	}

	sna->render.vertex_index = vertex_index;
	sna->render.vertex_used = vertex_index * op->floats_per_vertex;
}
void gen4_vertex_flush(struct sna *sna) |
{ |
DBG(("%s[%x] = %d\n", __FUNCTION__, |
45,7 → 68,9 |
sna->render.vertex_index - sna->render.vertex_start)); |
assert(sna->render.vertex_offset); |
assert(sna->render.vertex_offset <= sna->kgem.nbatch); |
assert(sna->render.vertex_index > sna->render.vertex_start); |
assert(sna->render.vertex_used <= sna->render.vertex_size); |
sna->kgem.batch[sna->render.vertex_offset] = |
sna->render.vertex_index - sna->render.vertex_start; |
62,11 → 87,14 |
sna->render.vertex_used, sna->render.vertex_size)); |
assert(sna->render.vertex_offset == 0); |
assert(sna->render.vertex_used); |
assert(sna->render.vertex_used <= sna->render.vertex_size); |
sna_vertex_wait__locked(&sna->render); |
/* Note: we only need dword alignment (currently) */ |
hint = CREATE_GTT_MAP; |
bo = sna->render.vbo; |
if (bo) { |
for (i = 0; i < sna->render.nvertex_reloc; i++) { |
88,11 → 116,15 |
sna->render.vb_id = 0; |
kgem_bo_destroy(&sna->kgem, bo); |
hint |= CREATE_CACHED | CREATE_NO_THROTTLE; |
} else { |
if (kgem_is_idle(&sna->kgem)) { |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
return 0; |
} |
} |
hint = CREATE_GTT_MAP; |
if (bo) |
hint |= CREATE_CACHED | CREATE_NO_THROTTLE; |
size = 256*1024; |
assert(!sna->render.active); |
163,7 → 195,7 |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
free_bo = bo; |
} else if (IS_CPU_MAP(bo->map) && !sna->kgem.has_llc) { |
} else if (!sna->kgem.has_llc && sna->render.vertices == MAP(bo->map__cpu)) { |
DBG(("%s: converting CPU map to GTT\n", __FUNCTION__)); |
sna->render.vertices = |
kgem_bo_map__gtt(&sna->kgem, sna->render.vbo); |
176,9 → 208,16 |
} |
} else { |
if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) { |
int size; |
size = sna->kgem.nbatch; |
size += sna->kgem.batch_size - sna->kgem.surface; |
size += sna->render.vertex_used; |
if (size <= 1024) { |
DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__, |
sna->render.vertex_used, sna->kgem.nbatch)); |
assert(sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface); |
memcpy(sna->kgem.batch + sna->kgem.nbatch, |
sna->render.vertex_data, |
sna->render.vertex_used * 4); |
186,6 → 225,37 |
bo = NULL; |
sna->kgem.nbatch += sna->render.vertex_used; |
} else { |
size = 256 * 1024; |
do { |
bo = kgem_create_linear(&sna->kgem, size, |
CREATE_GTT_MAP | CREATE_NO_RETIRE | CREATE_NO_THROTTLE | CREATE_CACHED); |
} while (bo == NULL && (size>>=1) > sizeof(float)*sna->render.vertex_used); |
sna->render.vertices = NULL; |
if (bo) |
sna->render.vertices = kgem_bo_map(&sna->kgem, bo); |
if (sna->render.vertices != NULL) { |
DBG(("%s: new vbo: %d / %d\n", __FUNCTION__, |
sna->render.vertex_used, __kgem_bo_size(bo)/4)); |
assert(sizeof(float)*sna->render.vertex_used <= __kgem_bo_size(bo)); |
memcpy(sna->render.vertices, |
sna->render.vertex_data, |
sizeof(float)*sna->render.vertex_used); |
size = __kgem_bo_size(bo)/4; |
if (size >= UINT16_MAX) |
size = UINT16_MAX - 1; |
sna->render.vbo = bo; |
sna->render.vertex_size = size; |
} else { |
DBG(("%s: tmp vbo: %d\n", __FUNCTION__, |
sna->render.vertex_used)); |
if (bo) |
kgem_bo_destroy(&sna->kgem, bo); |
bo = kgem_create_linear(&sna->kgem, |
4*sna->render.vertex_used, |
CREATE_NO_THROTTLE); |
195,11 → 265,14 |
kgem_bo_destroy(&sna->kgem, bo); |
bo = NULL; |
} |
DBG(("%s: new vbo: %d\n", __FUNCTION__, |
sna->render.vertex_used)); |
assert(sna->render.vbo == NULL); |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
free_bo = bo; |
} |
} |
} |
assert(sna->render.nvertex_reloc); |
for (i = 0; i < sna->render.nvertex_reloc; i++) { |
/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.h |
---|
6,6 → 6,7 |
#include "sna.h" |
#include "sna_render.h" |
void gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op); |
void gen4_vertex_flush(struct sna *sna); |
int gen4_vertex_finish(struct sna *sna); |
void gen4_vertex_close(struct sna *sna); |
/contrib/sdk/sources/Intel-2D/sna/gen5_render.c |
---|
42,6 → 42,7 |
#include "brw/brw.h" |
#include "gen5_render.h" |
#include "gen4_common.h" |
#include "gen4_source.h" |
#include "gen4_vertex.h" |
719,16 → 720,10 |
{ |
assert(op->floats_per_rect == 3*op->floats_per_vertex); |
if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) { |
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) |
gen4_vertex_finish(sna); |
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", |
DBG(("aligning vertex: was %d, now %d floats per vertex\n", |
sna->render_state.gen5.floats_per_vertex, |
op->floats_per_vertex, |
sna->render.vertex_index, |
(sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); |
sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; |
op->floats_per_vertex)); |
gen4_vertex_align(sna, op); |
sna->render_state.gen5.floats_per_vertex = op->floats_per_vertex; |
} |
} |
942,10 → 937,14 |
inline static void |
gen5_emit_pipe_flush(struct sna *sna) |
{ |
#if 0 |
OUT_BATCH(GEN5_PIPE_CONTROL | (4 - 2)); |
OUT_BATCH(GEN5_PIPE_CONTROL_WC_FLUSH); |
OUT_BATCH(0); |
OUT_BATCH(0); |
#else |
OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH); |
#endif |
} |
static void |
1311,11 → 1310,12 |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) |
return false; |
} |
gen5_align_vertex(sna, &tmp); |
gen5_video_bind_surfaces(sna, &tmp); |
gen5_align_vertex(sna, &tmp); |
/* Set up the offset for translating from the given region (in screen |
* coordinates) to the backing pixmap. |
1452,7 → 1452,6 |
} |
if (mask == NULL && |
try_blt(sna, dst, src, width, height) && |
sna_blt_composite(sna, op, |
src, dst, |
src_x, src_y, |
1577,8 → 1576,8 |
goto cleanup_mask; |
} |
gen5_align_vertex(sna, tmp); |
gen5_bind_surfaces(sna, tmp); |
gen5_align_vertex(sna, tmp); |
return true; |
cleanup_mask: |
1806,8 → 1805,8 |
goto cleanup_src; |
} |
gen5_align_vertex(sna, &tmp->base); |
gen5_bind_surfaces(sna, &tmp->base); |
gen5_align_vertex(sna, &tmp->base); |
return true; |
cleanup_src: |
1952,7 → 1951,10 |
kgem_submit(&sna->kgem); |
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) { |
DBG(("%s: aperture check failed\n", __FUNCTION__)); |
goto fallback_tiled_src; |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
if (tmp.redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp.dst.bo); |
goto fallback_blt; |
} |
} |
1963,8 → 1965,8 |
src_dx += tmp.src.offset[0]; |
src_dy += tmp.src.offset[1]; |
gen5_align_vertex(sna, &tmp); |
gen5_copy_bind_surfaces(sna, &tmp); |
gen5_align_vertex(sna, &tmp); |
do { |
int n_this_time; |
1999,8 → 2001,6 |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
return true; |
fallback_tiled_src: |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
fallback_tiled_dst: |
if (tmp.redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp.dst.bo); |
2021,17 → 2021,7 |
} |
#endif |
static void |
gen5_render_flush(struct sna *sna) |
{ |
gen4_vertex_close(sna); |
assert(sna->render.vb_id == 0); |
assert(sna->render.vertex_offset == 0); |
} |
static void |
gen5_render_context_switch(struct kgem *kgem, |
int new_mode) |
{ |
2060,42 → 2050,6 |
} |
} |
static void |
discard_vbo(struct sna *sna) |
{ |
kgem_bo_destroy(&sna->kgem, sna->render.vbo); |
sna->render.vbo = NULL; |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
static void |
gen5_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo\n", __FUNCTION__)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void |
gen5_render_expire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.vbo && !sna->render.vertex_used) { |
DBG(("%s: discarding vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
} |
static void gen5_render_reset(struct sna *sna) |
{ |
sna->render_state.gen5.needs_invariant = true; |
2107,8 → 2061,7 |
sna->render_state.gen5.drawrect_limit = -1; |
sna->render_state.gen5.surface_table = -1; |
if (sna->render.vbo && |
!kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { |
if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { |
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
2351,8 → 2304,8 |
return backend; |
sna->kgem.context_switch = gen5_render_context_switch; |
sna->kgem.retire = gen5_render_retire; |
sna->kgem.expire = gen5_render_expire; |
sna->kgem.retire = gen4_render_retire; |
sna->kgem.expire = gen4_render_expire; |
#if 0 |
#if !NO_COMPOSITE |
2362,7 → 2315,7 |
#if !NO_COMPOSITE_SPANS |
sna->render.check_composite_spans = gen5_check_composite_spans; |
sna->render.composite_spans = gen5_render_composite_spans; |
if (sna->PciInfo->device_id == 0x0044) |
if (intel_get_device_id(sna->scrn) == 0x0044) |
sna->render.prefer_gpu |= PREFER_GPU_SPANS; |
#endif |
sna->render.video = gen5_render_video; |
2378,7 → 2331,7 |
sna->render.blit_tex = gen5_blit_tex; |
sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; |
sna->render.flush = gen5_render_flush; |
sna->render.flush = gen4_render_flush; |
sna->render.reset = gen5_render_reset; |
sna->render.fini = gen5_render_fini; |
2466,8 → 2419,8 |
kgem_submit(&sna->kgem); |
} |
gen5_align_vertex(sna, tmp); |
gen5_bind_surfaces(sna, tmp); |
gen5_align_vertex(sna, tmp); |
return true; |
} |
/contrib/sdk/sources/Intel-2D/sna/gen6_common.c |
---|
0,0 → 1,71 |
/* |
* Copyright © 2011-2013 Intel Corporation |
* |
* Permission is hereby granted, free of charge, to any person obtaining a |
* copy of this software and associated documentation files (the "Software"), |
* to deal in the Software without restriction, including without limitation |
* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
* and/or sell copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following conditions: |
* |
* The above copyright notice and this permission notice (including the next |
* paragraph) shall be included in all copies or substantial portions of the |
* Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
* SOFTWARE. |
* |
* Authors: |
* Chris Wilson <chris@chris-wilson.co.uk> |
* |
*/ |
#ifdef HAVE_CONFIG_H |
#include "config.h" |
#endif |
#include "gen6_common.h" |
#include "gen4_vertex.h" |
/* Flush any work belonging to the previous ring before switching.
 * A batch containing commands is submitted; one that only accumulated
 * execobjects/reservations (no commands) is simply reset.
 */
void
gen6_render_context_switch(struct kgem *kgem,
			   int new_mode)
{
	/* Commands already emitted must go out on the old ring. */
	if (kgem->nbatch) {
		DBG(("%s: from %d to %d, submit batch\n", __FUNCTION__, kgem->mode, new_mode));
		_kgem_submit(kgem);
	}

	/* NOTE(review): presumably an incomplete batch (execobjects but
	 * no commands) — discard it rather than submit. */
	if (kgem->nexec) {
		DBG(("%s: from %d to %d, reset incomplete batch\n", __FUNCTION__, kgem->mode, new_mode));
		kgem_reset(kgem);
	}

	assert(kgem->nbatch == 0);
	assert(kgem->nreloc == 0);
	assert(kgem->nexec == 0);

	kgem->ring = new_mode;
}
void gen6_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire)) |
kgem->ring = kgem->mode; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.nvertex_reloc == 0 && |
sna->render.vbo && |
!kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo\n", __FUNCTION__)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
/contrib/sdk/sources/Intel-2D/sna/gen6_common.h |
---|
0,0 → 1,139 |
/* |
* Copyright © 2011-2013 Intel Corporation |
* |
* Permission is hereby granted, free of charge, to any person obtaining a |
* copy of this software and associated documentation files (the "Software"), |
* to deal in the Software without restriction, including without limitation |
* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
* and/or sell copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following conditions: |
* |
* The above copyright notice and this permission notice (including the next |
* paragraph) shall be included in all copies or substantial portions of the |
* Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
* SOFTWARE. |
* |
* Authors: |
* Chris Wilson <chris@chris-wilson.co.uk> |
* |
*/ |
#ifndef GEN6_COMMON_H |
#define GEN6_COMMON_H |
#include "sna.h" |
#define NO_RING_SWITCH 0 |
#define PREFER_RENDER 0 |
static inline bool is_uncached(struct sna *sna, |
struct kgem_bo *bo) |
{ |
return bo->scanout && !sna->kgem.has_wt; |
} |
/* Ring switching onto the BLT is disabled in this port: always decline. */
inline static bool can_switch_to_blt(struct sna *sna,
				     struct kgem_bo *bo,
				     unsigned flags)
{
	(void)sna;
	(void)bo;
	(void)flags;

	return false;
}
/* Decide whether work targeting bo may be (or stay) scheduled on the
 * RENDER ring.  Cross-ring moves require semaphores for synchronisation
 * and are avoided for uncached targets last touched by the BLT.
 */
inline static bool can_switch_to_render(struct sna *sna,
					struct kgem_bo *bo)
{
	/* Already on the render ring: nothing to switch. */
	if (sna->kgem.ring == KGEM_RENDER)
		return true;

	if (NO_RING_SWITCH)
		return false;

	if (!sna->kgem.has_semaphores)
		return false;

	/* A cacheable bo whose last request was not on the BLT is a good
	 * candidate for the render ring. */
	if (bo && !RQ_IS_BLT(bo->rq) && !is_uncached(sna, bo))
		return true;

	/* Otherwise only switch while the render ring is busy anyway. */
	return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER);
}
static inline bool untiled_tlb_miss(struct kgem_bo *bo) |
{ |
if (kgem_bo_is_render(bo)) |
return false; |
return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096; |
} |
static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo) |
{ |
if (bo->rq) |
return RQ_IS_BLT(bo->rq); |
if (sna->flags & SNA_POWERSAVE) |
return true; |
return bo->tiling == I915_TILING_NONE || is_uncached(sna, bo); |
} |
inline static bool force_blt_ring(struct sna *sna) |
{ |
if (sna->flags & SNA_POWERSAVE) |
return true; |
if (sna->kgem.mode == KGEM_RENDER) |
return false; |
if (sna->render_state.gt < 2) |
return true; |
return false; |
} |
/* Soft preference for the BLT ring; only meaningful when neither a forced
 * BLT choice nor a render-active bo applies.  Delegates to the switch
 * heuristic (which always declines in this port). */
inline static bool prefer_blt_ring(struct sna *sna,
				   struct kgem_bo *bo,
				   unsigned flags)
{
	assert(!force_blt_ring(sna));
	assert(!kgem_bo_is_render(bo));

	return can_switch_to_blt(sna, bo, flags);
}
inline static bool prefer_render_ring(struct sna *sna, |
struct kgem_bo *bo) |
{ |
if (sna->flags & SNA_POWERSAVE) |
return false; |
if (sna->render_state.gt < 2) |
return false; |
return can_switch_to_render(sna, bo); |
} |
/* Composite operations never prefer the BLT in this port. */
inline static bool
prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp)
{
	(void)sna;
	(void)tmp;

	return false;
}
/* Fill operations never prefer the BLT in this port. */
static inline bool prefer_blt_fill(struct sna *sna,
				   struct kgem_bo *bo,
				   unsigned flags)
{
	(void)sna;
	(void)bo;
	(void)flags;

	return false;
}
void gen6_render_context_switch(struct kgem *kgem, int new_mode); |
void gen6_render_retire(struct kgem *kgem); |
#endif /* GEN6_COMMON_H */ |
/contrib/sdk/sources/Intel-2D/sna/gen6_render.c |
---|
39,6 → 39,8 |
#include "brw/brw.h" |
#include "gen6_render.h" |
#include "gen6_common.h" |
#include "gen4_common.h" |
#include "gen4_source.h" |
#include "gen4_vertex.h" |
74,6 → 76,7 |
int max_vs_entries; |
int max_gs_entries; |
} urb; |
int gt; |
}; |
static const struct gt_info gt1_info = { |
82,6 → 85,7 |
.max_gs_threads = 21, |
.max_wm_threads = 40, |
.urb = { 32, 256, 256 }, |
.gt = 1, |
}; |
static const struct gt_info gt2_info = { |
90,6 → 94,7 |
.max_gs_threads = 60, |
.max_wm_threads = 80, |
.urb = { 64, 256, 256 }, |
.gt = 2, |
}; |
static const uint32_t ps_kernel_packed[][4] = { |
872,21 → 877,22 |
const struct sna_composite_op *op, |
uint16_t wm_binding_table) |
{ |
bool need_stall = wm_binding_table & 1; |
bool need_flush, need_stall; |
assert(op->dst.bo->exec); |
if (gen6_emit_cc(sna, GEN6_BLEND(op->u.gen6.flags))) |
need_stall = false; |
need_flush = |
gen6_emit_cc(sna, GEN6_BLEND(op->u.gen6.flags)) && |
wm_binding_table & 1; |
gen6_emit_sampler(sna, GEN6_SAMPLER(op->u.gen6.flags)); |
gen6_emit_sf(sna, GEN6_VERTEX(op->u.gen6.flags) >> 2); |
gen6_emit_wm(sna, GEN6_KERNEL(op->u.gen6.flags), GEN6_VERTEX(op->u.gen6.flags) >> 2); |
gen6_emit_vertex_elements(sna, op); |
need_stall |= gen6_emit_binding_table(sna, wm_binding_table & ~1); |
need_stall = gen6_emit_binding_table(sna, wm_binding_table & ~1); |
if (gen6_emit_drawing_rectangle(sna, op)) |
need_stall = false; |
if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { |
if (need_flush || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { |
gen6_emit_flush(sna); |
kgem_clear_dirty(&sna->kgem); |
assert(op->dst.bo->exec); |
1317,16 → 1323,10 |
{ |
assert (sna->render.vertex_offset == 0); |
if (op->floats_per_vertex != sna->render_state.gen6.floats_per_vertex) { |
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) |
gen4_vertex_finish(sna); |
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", |
DBG(("aligning vertex: was %d, now %d floats per vertex\n", |
sna->render_state.gen6.floats_per_vertex, |
op->floats_per_vertex, |
sna->render.vertex_index, |
(sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); |
sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; |
op->floats_per_vertex)); |
gen4_vertex_align(sna, op); |
sna->render_state.gen6.floats_per_vertex = op->floats_per_vertex; |
} |
assert((sna->render.vertex_used % op->floats_per_vertex) == 0); |
1657,8 → 1657,8 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, &tmp); |
gen6_emit_video_state(sna, &tmp); |
gen6_align_vertex(sna, &tmp); |
/* Set up the offset for translating from the given region (in screen |
* coordinates) to the backing pixmap. |
1853,9 → 1853,9 |
} else |
sna_render_picture_extents(dst, &box); |
// op->dst.bo = sna_drawable_use_bo (dst->pDrawable, |
// PREFER_GPU | FORCE_GPU | RENDER_GPU, |
// &box, &op->damage); |
op->dst.bo = sna_drawable_use_bo(dst->pDrawable, |
PREFER_GPU | FORCE_GPU | RENDER_GPU, |
&box, &op->damage); |
if (op->dst.bo == NULL) |
return false; |
1925,7 → 1925,13 |
return true; |
if (gen6_composite_fallback(sna, src, mask, dst)) |
return false; |
return (mask == NULL && |
sna_blt_composite(sna, op, |
src, dst, |
src_x, src_y, |
dst_x, dst_y, |
width, height, |
tmp, true)); |
if (need_tiling(sna, width, height)) |
return sna_tiling_composite(op, src, mask, dst, |
2051,8 → 2057,8 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, tmp); |
gen6_emit_composite_state(sna, tmp); |
gen6_align_vertex(sna, tmp); |
return true; |
cleanup_mask: |
2284,8 → 2290,8 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, &tmp->base); |
gen6_emit_composite_state(sna, &tmp->base); |
gen6_align_vertex(sna, &tmp->base); |
return true; |
cleanup_src: |
2351,10 → 2357,16 |
untiled_tlb_miss(dst_bo)) |
return true; |
if (force_blt_ring(sna)) |
return true; |
if (kgem_bo_is_render(dst_bo) || |
kgem_bo_is_render(src_bo)) |
return false; |
if (prefer_render_ring(sna, dst_bo)) |
return false; |
if (!prefer_blt_ring(sna, dst_bo, flags)) |
return false; |
2553,13 → 2565,17 |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) { |
DBG(("%s: too large for a single operation\n", |
__FUNCTION__)); |
goto fallback_tiled_src; |
if (tmp.src.bo != src_bo) |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
if (tmp.redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp.dst.bo); |
goto fallback_blt; |
} |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, &tmp); |
gen6_emit_copy_state(sna, &tmp); |
gen6_align_vertex(sna, &tmp); |
do { |
int16_t *v; |
2596,9 → 2612,6 |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
return true; |
fallback_tiled_src: |
if (tmp.src.bo != src_bo) |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
fallback_tiled_dst: |
if (tmp.redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp.dst.bo); |
2720,8 → 2733,8 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, &op->base); |
gen6_emit_copy_state(sna, &op->base); |
gen6_align_vertex(sna, &op->base); |
op->blt = gen6_render_copy_blt; |
op->done = gen6_render_copy_done; |
2760,24 → 2773,6 |
gen6_emit_state(sna, op, offset | dirty); |
} |
static inline bool prefer_blt_fill(struct sna *sna, |
struct kgem_bo *bo) |
{ |
if (PREFER_RENDER) |
return PREFER_RENDER < 0; |
if (kgem_bo_is_render(bo)) |
return false; |
if (untiled_tlb_miss(bo)) |
return true; |
if (!prefer_blt_ring(sna, bo, 0)) |
return false; |
return prefer_blt_bo(sna, bo); |
} |
static bool |
gen6_render_fill_boxes(struct sna *sna, |
CARD8 op, |
2799,7 → 2794,8 |
return false; |
} |
if (prefer_blt_fill(sna, dst_bo) || !gen6_check_dst_format(format)) { |
if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) || |
!gen6_check_dst_format(format)) { |
uint8_t alu = GXinvalid; |
if (op <= PictOpSrc) { |
2874,13 → 2870,14 |
assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); |
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); |
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); |
} |
gen6_align_vertex(sna, &tmp); |
gen6_emit_fill_state(sna, &tmp); |
gen6_align_vertex(sna, &tmp); |
do { |
int n_this_time; |
3009,12 → 3006,12 |
static bool |
gen6_render_fill(struct sna *sna, uint8_t alu, |
PixmapPtr dst, struct kgem_bo *dst_bo, |
uint32_t color, |
uint32_t color, unsigned flags, |
struct sna_fill_op *op) |
{ |
DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color)); |
if (prefer_blt_fill(sna, dst_bo) && |
if (prefer_blt_fill(sna, dst_bo, flags) && |
sna_blt_fill(sna, alu, |
dst_bo, dst->drawable.bitsPerPixel, |
color, |
3053,13 → 3050,14 |
assert(GEN6_SAMPLER(op->base.u.gen6.flags) == FILL_SAMPLER); |
assert(GEN6_VERTEX(op->base.u.gen6.flags) == FILL_VERTEX); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); |
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); |
} |
gen6_align_vertex(sna, &op->base); |
gen6_emit_fill_state(sna, &op->base); |
gen6_align_vertex(sna, &op->base); |
op->blt = gen6_render_op_fill_blt; |
op->box = gen6_render_op_fill_box; |
3097,7 → 3095,7 |
int16_t *v; |
/* Prefer to use the BLT if already engaged */ |
if (prefer_blt_fill(sna, bo) && |
if (prefer_blt_fill(sna, bo, FILL_BOXES) && |
gen6_render_fill_one_try_blt(sna, dst, bo, color, |
x1, y1, x2, y2, alu)) |
return true; |
3133,6 → 3131,7 |
assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); |
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); |
if (!kgem_check_bo(&sna->kgem, bo, NULL)) { |
kgem_submit(&sna->kgem); |
if (!kgem_check_bo(&sna->kgem, bo, NULL)) { |
3141,8 → 3140,8 |
} |
} |
gen6_align_vertex(sna, &tmp); |
gen6_emit_fill_state(sna, &tmp); |
gen6_align_vertex(sna, &tmp); |
gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state); |
3219,6 → 3218,7 |
assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); |
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); |
if (!kgem_check_bo(&sna->kgem, bo, NULL)) { |
kgem_submit(&sna->kgem); |
if (!kgem_check_bo(&sna->kgem, bo, NULL)) { |
3227,8 → 3227,8 |
} |
} |
gen6_align_vertex(sna, &tmp); |
gen6_emit_fill_state(sna, &tmp); |
gen6_align_vertex(sna, &tmp); |
gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state); |
3251,60 → 3251,6 |
} |
#endif |
static void gen6_render_flush(struct sna *sna) |
{ |
gen4_vertex_close(sna); |
assert(sna->render.vb_id == 0); |
assert(sna->render.vertex_offset == 0); |
} |
static void |
gen6_render_context_switch(struct kgem *kgem, |
int new_mode) |
{ |
if (kgem->nbatch) { |
DBG(("%s: from %d to %d\n", __FUNCTION__, kgem->mode, new_mode)); |
_kgem_submit(kgem); |
} |
kgem->ring = new_mode; |
} |
static void |
gen6_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire)) |
kgem->ring = kgem->mode; |
sna = container_of(kgem, struct sna, kgem); |
if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void |
gen6_render_expire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.vbo && !sna->render.vertex_used) { |
DBG(("%s: discarding vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle)); |
kgem_bo_destroy(kgem, sna->render.vbo); |
assert(!sna->render.active); |
sna->render.vbo = NULL; |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void gen6_render_reset(struct sna *sna) |
{ |
sna->render_state.gen6.needs_invariant = true; |
3320,6 → 3266,11 |
sna->render_state.gen6.drawrect_limit = -1; |
sna->render_state.gen6.surface_table = -1; |
if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { |
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
sna->render.vertex_offset = 0; |
sna->render.nvertex_reloc = 0; |
sna->render.vb_id = 0; |
3330,17 → 3281,17 |
kgem_bo_destroy(&sna->kgem, sna->render_state.gen6.general_bo); |
} |
static bool is_gt2(struct sna *sna) |
static bool is_gt2(struct sna *sna, int devid) |
{ |
return sna->PciInfo->device_id & 0x30; |
return devid & 0x30; |
} |
static bool is_mobile(struct sna *sna) |
static bool is_mobile(struct sna *sna, int devid) |
{ |
return (sna->PciInfo->device_id & 0xf) == 0x6; |
return (devid & 0xf) == 0x6; |
} |
static bool gen6_render_setup(struct sna *sna) |
static bool gen6_render_setup(struct sna *sna, int devid) |
{ |
struct gen6_render_state *state = &sna->render_state.gen6; |
struct sna_static_stream general; |
3348,8 → 3299,9 |
int i, j, k, l, m; |
state->info = >1_info; |
if (is_gt2(sna)) |
if (is_gt2(sna, devid)) |
state->info = >2_info; /* XXX requires GT_MODE WiZ disabled */ |
state->gt = state->info->gt; |
sna_static_stream_init(&general); |
3420,12 → 3372,14 |
const char *gen6_render_init(struct sna *sna, const char *backend) |
{ |
if (!gen6_render_setup(sna)) |
int devid = intel_get_device_id(sna); |
if (!gen6_render_setup(sna, devid)) |
return backend; |
sna->kgem.context_switch = gen6_render_context_switch; |
sna->kgem.retire = gen6_render_retire; |
sna->kgem.expire = gen6_render_expire; |
sna->kgem.expire = gen4_render_expire; |
#if 0 |
#if !NO_COMPOSITE |
3436,7 → 3390,7 |
#if !NO_COMPOSITE_SPANS |
sna->render.check_composite_spans = gen6_check_composite_spans; |
sna->render.composite_spans = gen6_render_composite_spans; |
if (is_mobile(sna)) |
if (is_mobile(sna, devid)) |
sna->render.prefer_gpu |= PREFER_GPU_SPANS; |
#endif |
sna->render.video = gen6_render_video; |
3465,7 → 3419,7 |
sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; |
sna->render.blit_tex = gen6_blit_tex; |
sna->render.flush = gen6_render_flush; |
sna->render.flush = gen4_render_flush; |
sna->render.reset = gen6_render_reset; |
sna->render.fini = gen6_render_fini; |
3568,7 → 3522,7 |
// tmp->box = gen6_render_composite_box; |
tmp->done = gen6_render_composite_done; |
kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); |
if (!kgem_check_bo(&sna->kgem, |
tmp->dst.bo, tmp->src.bo, tmp->mask.bo, |
NULL)) { |
3576,8 → 3530,8 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen6_align_vertex(sna, tmp); |
gen6_emit_composite_state(sna, tmp); |
gen6_align_vertex(sna, tmp); |
return true; |
} |
/contrib/sdk/sources/Intel-2D/sna/gen7_render.c |
---|
42,10 → 42,14 |
#include "brw/brw.h" |
#include "gen7_render.h" |
#include "gen4_common.h" |
#include "gen4_source.h" |
#include "gen4_vertex.h" |
#include "gen6_common.h" |
#define ALWAYS_INVALIDATE 0 |
#define ALWAYS_FLUSH 0 |
#define ALWAYS_STALL 0 |
#define NO_COMPOSITE 0 |
#define NO_COMPOSITE_SPANS 0 |
1022,28 → 1026,39 |
const struct sna_composite_op *op, |
uint16_t wm_binding_table) |
{ |
bool need_invalidate; |
bool need_flush; |
bool need_stall; |
assert(op->dst.bo->exec); |
gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags)); |
gen7_emit_sampler(sna, GEN7_SAMPLER(op->u.gen7.flags)); |
gen7_emit_sf(sna, GEN7_VERTEX(op->u.gen7.flags) >> 2); |
gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags)); |
gen7_emit_vertex_elements(sna, op); |
need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo); |
if (ALWAYS_INVALIDATE) |
need_invalidate = true; |
need_stall = gen7_emit_binding_table(sna, wm_binding_table); |
need_flush = |
sna->render_state.gen7.emit_flush && |
wm_binding_table & GEN7_READS_DST(op->u.gen7.flags); |
if (ALWAYS_FLUSH) |
need_flush = true; |
wm_binding_table &= ~1; |
need_stall = sna->render_state.gen7.surface_table != wm_binding_table; |
need_stall &= gen7_emit_drawing_rectangle(sna, op); |
if (ALWAYS_STALL) |
need_stall = true; |
if (ALWAYS_FLUSH || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { |
if (need_invalidate) { |
gen7_emit_pipe_invalidate(sna); |
kgem_clear_dirty(&sna->kgem); |
assert(op->dst.bo->exec); |
kgem_bo_mark_dirty(op->dst.bo); |
sna->render_state.gen7.emit_flush = false; |
need_flush = false; |
need_stall = false; |
} |
if (sna->render_state.gen7.emit_flush) { |
if (need_flush) { |
gen7_emit_pipe_flush(sna, need_stall); |
need_stall = false; |
} |
1050,6 → 1065,13 |
if (need_stall) |
gen7_emit_pipe_stall(sna); |
gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags)); |
gen7_emit_sampler(sna, GEN7_SAMPLER(op->u.gen7.flags)); |
gen7_emit_sf(sna, GEN7_VERTEX(op->u.gen7.flags) >> 2); |
gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags)); |
gen7_emit_vertex_elements(sna, op); |
gen7_emit_binding_table(sna, wm_binding_table); |
sna->render_state.gen7.emit_flush = GEN7_READS_DST(op->u.gen7.flags); |
} |
1404,12 → 1426,14 |
const struct sna_composite_op *op) |
{ |
uint32_t *binding_table; |
uint16_t offset; |
uint16_t offset, dirty; |
gen7_get_batch(sna, op); |
binding_table = gen7_composite_get_binding_table(sna, &offset); |
dirty = kgem_bo_is_dirty(op->dst.bo); |
binding_table[0] = |
gen7_bind_bo(sna, |
op->dst.bo, op->dst.width, op->dst.height, |
1438,7 → 1462,7 |
offset = sna->render_state.gen7.surface_table; |
} |
gen7_emit_state(sna, op, offset); |
gen7_emit_state(sna, op, offset | dirty); |
} |
static void |
1445,16 → 1469,9 |
gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op) |
{ |
if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) { |
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) |
gen4_vertex_finish(sna); |
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", |
sna->render_state.gen7.floats_per_vertex, |
op->floats_per_vertex, |
sna->render.vertex_index, |
(sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); |
sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; |
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; |
DBG(("aligning vertex: was %d, now %d floats per vertex\n", |
sna->render_state.gen7.floats_per_vertex, op->floats_per_vertex)); |
gen4_vertex_align(sna, op); |
sna->render_state.gen7.floats_per_vertex = op->floats_per_vertex; |
} |
} |
1548,7 → 1565,7 |
int src_height[6]; |
int src_pitch[6]; |
uint32_t *binding_table; |
uint16_t offset; |
uint16_t offset, dirty; |
int n_src, n; |
gen7_get_batch(sna, op); |
1586,6 → 1603,8 |
binding_table = gen7_composite_get_binding_table(sna, &offset); |
dirty = kgem_bo_is_dirty(op->dst.bo); |
binding_table[0] = |
gen7_bind_bo(sna, |
op->dst.bo, op->dst.width, op->dst.height, |
1602,7 → 1621,7 |
src_surf_format); |
} |
gen7_emit_state(sna, op, offset); |
gen7_emit_state(sna, op, offset | dirty); |
} |
static bool |
1669,12 → 1688,14 |
kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo); |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); |
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) |
return false; |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen7_align_vertex(sna, &tmp); |
gen7_emit_video_state(sna, &tmp); |
gen7_align_vertex(sna, &tmp); |
/* Set up the offset for translating from the given region (in screen |
* coordinates) to the backing pixmap. |
1874,7 → 1895,8 |
return false; |
} |
if (prefer_blt_fill(sna, dst_bo) || !gen7_check_dst_format(format)) { |
if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) || |
!gen7_check_dst_format(format)) { |
uint8_t alu = GXinvalid; |
if (op <= PictOpSrc) { |
1949,11 → 1971,17 |
kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); |
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { |
kgem_submit(&sna->kgem); |
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); |
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { |
kgem_bo_destroy(&sna->kgem, tmp.src.bo); |
if (tmp.redirect.real_bo) |
kgem_bo_destroy(&sna->kgem, tmp.dst.bo); |
return false; |
} |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen7_align_vertex(sna, &tmp); |
gen7_emit_fill_state(sna, &tmp); |
gen7_align_vertex(sna, &tmp); |
do { |
int n_this_time; |
1987,60 → 2015,6 |
} |
#endif |
static void gen7_render_flush(struct sna *sna) |
{ |
gen4_vertex_close(sna); |
assert(sna->render.vb_id == 0); |
assert(sna->render.vertex_offset == 0); |
} |
static void |
gen7_render_context_switch(struct kgem *kgem, |
int new_mode) |
{ |
if (kgem->nbatch) { |
DBG(("%s: switch rings %d -> %d\n", |
__FUNCTION__, kgem->mode, new_mode)); |
_kgem_submit(kgem); |
} |
kgem->ring = new_mode; |
} |
static void |
gen7_render_retire(struct kgem *kgem) |
{ |
struct sna *sna; |
if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire)) |
kgem->ring = kgem->mode; |
sna = container_of(kgem, struct sna, kgem); |
if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { |
DBG(("%s: resetting idle vbo\n", __FUNCTION__)); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void |
gen7_render_expire(struct kgem *kgem) |
{ |
struct sna *sna; |
sna = container_of(kgem, struct sna, kgem); |
if (sna->render.vbo && !sna->render.vertex_used) { |
DBG(("%s: discarding vbo\n", __FUNCTION__)); |
kgem_bo_destroy(kgem, sna->render.vbo); |
sna->render.vbo = NULL; |
sna->render.vertices = sna->render.vertex_data; |
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); |
sna->render.vertex_used = 0; |
sna->render.vertex_index = 0; |
} |
} |
static void gen7_render_reset(struct sna *sna) |
{ |
sna->render_state.gen7.emit_flush = false; |
2056,6 → 2030,11 |
sna->render_state.gen7.drawrect_limit = -1; |
sna->render_state.gen7.surface_table = -1; |
if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { |
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); |
discard_vbo(sna); |
} |
sna->render.vertex_offset = 0; |
sna->render.nvertex_reloc = 0; |
sna->render.vb_id = 0; |
2066,23 → 2045,23 |
kgem_bo_destroy(&sna->kgem, sna->render_state.gen7.general_bo); |
} |
static bool is_gt3(struct sna *sna) |
static bool is_gt3(struct sna *sna, int devid) |
{ |
assert(sna->kgem.gen == 075); |
return sna->PciInfo->device_id & 0x20; |
return devid & 0x20; |
} |
static bool is_gt2(struct sna *sna) |
static bool is_gt2(struct sna *sna, int devid) |
{ |
return sna->PciInfo->device_id & (is_hsw(sna)? 0x30 : 0x20); |
return devid & (is_hsw(sna)? 0x30 : 0x20); |
} |
static bool is_mobile(struct sna *sna) |
static bool is_mobile(struct sna *sna, int devid) |
{ |
return (sna->PciInfo->device_id & 0xf) == 0x6; |
return (devid & 0xf) == 0x6; |
} |
static bool gen7_render_setup(struct sna *sna) |
static bool gen7_render_setup(struct sna *sna, int devid) |
{ |
struct gen7_render_state *state = &sna->render_state.gen7; |
struct sna_static_stream general; |
2091,9 → 2070,9 |
if (is_ivb(sna)) { |
state->info = &ivb_gt_info; |
if (sna->PciInfo->device_id & 0xf) { |
if (devid & 0xf) { |
state->info = &ivb_gt1_info; |
if (is_gt2(sna)) |
if (is_gt2(sna, devid)) |
state->info = &ivb_gt2_info; /* XXX requires GT_MODE WiZ disabled */ |
} |
} else if (is_byt(sna)) { |
2100,10 → 2079,10 |
state->info = &byt_gt_info; |
} else if (is_hsw(sna)) { |
state->info = &hsw_gt_info; |
if (sna->PciInfo->device_id & 0xf) { |
if (is_gt3(sna)) |
if (devid & 0xf) { |
if (is_gt3(sna, devid)) |
state->info = &hsw_gt3_info; |
else if (is_gt2(sna)) |
else if (is_gt2(sna, devid)) |
state->info = &hsw_gt2_info; |
else |
state->info = &hsw_gt1_info; |
2111,6 → 2090,8 |
} else |
return false; |
state->gt = state->info->gt; |
sna_static_stream_init(&general); |
/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer |
2175,12 → 2156,14 |
const char *gen7_render_init(struct sna *sna, const char *backend) |
{ |
if (!gen7_render_setup(sna)) |
int devid = intel_get_device_id(sna); |
if (!gen7_render_setup(sna, devid)) |
return backend; |
sna->kgem.context_switch = gen7_render_context_switch; |
sna->kgem.retire = gen7_render_retire; |
sna->kgem.expire = gen7_render_expire; |
sna->kgem.context_switch = gen6_render_context_switch; |
sna->kgem.retire = gen6_render_retire; |
sna->kgem.expire = gen4_render_expire; |
#if 0 |
#if !NO_COMPOSITE |
2190,7 → 2173,7 |
#if !NO_COMPOSITE_SPANS |
sna->render.check_composite_spans = gen7_check_composite_spans; |
sna->render.composite_spans = gen7_render_composite_spans; |
if (is_mobile(sna) || is_gt2(sna) || is_byt(sna)) |
if (is_mobile(sna, devid) || is_gt2(sna, devid) || is_byt(sna)) |
sna->render.prefer_gpu |= PREFER_GPU_SPANS; |
#endif |
sna->render.video = gen7_render_video; |
2219,7 → 2202,7 |
sna->render.blit_tex = gen7_blit_tex; |
sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; |
sna->render.flush = gen7_render_flush; |
sna->render.flush = gen4_render_flush; |
sna->render.reset = gen7_render_reset; |
sna->render.fini = gen7_render_fini; |
2312,7 → 2295,7 |
// tmp->box = gen7_render_composite_box; |
tmp->done = gen7_render_composite_done; |
kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo); |
kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); |
if (!kgem_check_bo(&sna->kgem, |
tmp->dst.bo, tmp->src.bo, tmp->mask.bo, |
NULL)) { |
2320,7 → 2303,7 |
_kgem_set_mode(&sna->kgem, KGEM_RENDER); |
} |
gen7_align_vertex(sna, tmp); |
gen7_emit_composite_state(sna, tmp); |
gen7_align_vertex(sna, tmp); |
return true; |
} |
/contrib/sdk/sources/Intel-2D/sna/kgem.c |
---|
47,7 → 47,6 |
#include "sna_cpuid.h" |
static struct kgem_bo * |
search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); |
60,7 → 59,7 |
#define DBG_NO_CACHE_LEVEL 0 |
#define DBG_NO_CPU 0 |
#define DBG_NO_CREATE2 1 |
#define DBG_NO_USERPTR 0 |
#define DBG_NO_USERPTR 1 |
#define DBG_NO_UNSYNCHRONIZED_USERPTR 0 |
#define DBG_NO_LLC 0 |
#define DBG_NO_SEMAPHORES 0 |
72,7 → 71,7 |
#define DBG_NO_SECURE_BATCHES 0 |
#define DBG_NO_PINNED_BATCHES 0 |
#define DBG_NO_FAST_RELOC 0 |
#define DBG_NO_HANDLE_LUT 1 |
#define DBG_NO_HANDLE_LUT 0 |
#define DBG_NO_WT 0 |
#define DBG_DUMP 0 |
105,10 → 104,8 |
#define MAX_CPU_VMA_CACHE INT16_MAX |
#define MAP_PRESERVE_TIME 10 |
#define MAKE_CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1)) |
#define MAKE_USER_MAP(ptr) ((void*)((uintptr_t)(ptr) | 3)) |
#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 2) |
#define __MAP_TYPE(ptr) ((uintptr_t)(ptr) & 3) |
#define MAKE_USER_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1)) |
#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 1) |
#define MAKE_REQUEST(rq, ring) ((struct kgem_request *)((uintptr_t)(rq) | (ring))) |
158,8 → 155,13 |
uint32_t used; |
uint32_t need_io : 1; |
uint32_t write : 2; |
uint32_t mmapped : 1; |
uint32_t mmapped : 2; |
}; |
enum { |
MMAPPED_NONE, |
MMAPPED_GTT, |
MMAPPED_CPU |
}; |
static struct kgem_bo *__kgem_freed_bo; |
static struct kgem_request *__kgem_freed_request; |
252,10 → 254,11 |
return drmIoctl(fd, LOCAL_IOCTL_I915_GEM_SET_CACHING, &arg) == 0; |
} |
static uint32_t gem_userptr(int fd, void *ptr, int size, int read_only) |
{ |
return 0; |
} |
static bool __kgem_throttle_retire(struct kgem *kgem, unsigned flags) |
{ |
if (flags & CREATE_NO_RETIRE) { |
289,24 → 292,23 |
bo->handle, bytes(bo))); |
assert(bo->proxy == NULL); |
assert(!bo->snoop); |
assert(kgem_bo_can_map(kgem, bo)); |
assert(num_pages(bo) <= kgem->aperture_mappable / 4); |
retry_gtt: |
VG_CLEAR(mmap_arg); |
mmap_arg.handle = bo->handle; |
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg)) { |
int err = 0; |
(void)__kgem_throttle_retire(kgem, 0); |
if (kgem_expire_cache(kgem)) |
goto retry_gtt; |
if (kgem->need_expire) { |
kgem_cleanup_cache(kgem); |
if (kgem_cleanup_cache(kgem)) |
goto retry_gtt; |
} |
printf("%s: failed to retrieve GTT offset for handle=%d\n", |
__FUNCTION__, bo->handle); |
ErrorF("%s: failed to retrieve GTT offset for handle=%d: %d\n", |
__FUNCTION__, bo->handle, err); |
return NULL; |
} |
321,7 → 323,7 |
return ptr; |
} |
static int __gem_write(int fd, uint32_t handle, |
static int gem_write(int fd, uint32_t handle, |
int offset, int length, |
const void *src) |
{ |
338,7 → 340,7 |
return drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite); |
} |
static int gem_write(int fd, uint32_t handle, |
static int gem_write__cachealigned(int fd, uint32_t handle, |
int offset, int length, |
const void *src) |
{ |
631,7 → 633,7 |
static unsigned |
cpu_cache_size__cpuid4(void) |
{ |
/* Deterministic Cache Parmaeters (Function 04h)": |
/* Deterministic Cache Parameters (Function 04h)": |
* When EAX is initialized to a value of 4, the CPUID instruction |
* returns deterministic cache information in the EAX, EBX, ECX |
* and EDX registers. This function requires ECX be initialized |
755,7 → 757,7 |
* hw acceleration. |
*/ |
if (kgem->gen == 060 && dev->revision < 8) { |
if (kgem->gen == 060 && dev && dev->revision < 8) { |
/* pre-production SNB with dysfunctional BLT */ |
return false; |
} |
881,7 → 883,7 |
static bool kgem_init_pinned_batches(struct kgem *kgem) |
{ |
int count[2] = { 2, 2 }; |
int count[2] = { 4, 4 }; |
int size[2] = { 1, 2 }; |
int n, i; |
911,6 → 913,7 |
pin.alignment = 0; |
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin)) { |
gem_close(kgem->fd, pin.handle); |
free(bo); |
goto err; |
} |
bo->presumed_offset = pin.offset; |
1028,7 → 1031,6 |
kgem->has_no_reloc)); |
kgem->has_handle_lut = test_has_handle_lut(kgem); |
kgem->has_handle_lut = 0; |
DBG(("%s: has handle-lut? %d\n", __FUNCTION__, |
kgem->has_handle_lut)); |
1042,6 → 1044,10 |
DBG(("%s: can blt to cpu? %d\n", __FUNCTION__, |
kgem->can_blt_cpu)); |
kgem->can_render_y = gen != 021 && (gen >> 3) != 4; |
DBG(("%s: can render to Y-tiled surfaces? %d\n", __FUNCTION__, |
kgem->can_render_y)); |
kgem->has_secure_batches = test_has_secure_batches(kgem); |
DBG(("%s: can use privileged batchbuffers? %d\n", __FUNCTION__, |
kgem->has_secure_batches)); |
1115,6 → 1121,8 |
kgem->aperture_low, kgem->aperture_low / (1024*1024), |
kgem->aperture_high, kgem->aperture_high / (1024*1024))); |
kgem->aperture_mappable = 256 * 1024 * 1024; |
if (dev != NULL) |
kgem->aperture_mappable = agp_aperture_size(dev, gen); |
if (kgem->aperture_mappable == 0 || |
kgem->aperture_mappable > aperture.aper_size) |
1149,6 → 1157,14 |
if (kgem->max_gpu_size > totalram / 4) |
kgem->max_gpu_size = totalram / 4; |
if (kgem->aperture_high > totalram / 2) { |
kgem->aperture_high = totalram / 2; |
kgem->aperture_low = kgem->aperture_high / 4; |
DBG(("%s: reduced aperture watermaks to fit into ram; low=%d [%d], high=%d [%d]\n", __FUNCTION__, |
kgem->aperture_low, kgem->aperture_low / (1024*1024), |
kgem->aperture_high, kgem->aperture_high / (1024*1024))); |
} |
kgem->max_cpu_size = kgem->max_object_size; |
half_gpu_max = kgem->max_gpu_size / 2; |
1197,8 → 1213,10 |
kgem->max_upload_tile_size, kgem->max_copy_tile_size)); |
/* Convert the aperture thresholds to pages */ |
kgem->aperture_mappable /= PAGE_SIZE; |
kgem->aperture_low /= PAGE_SIZE; |
kgem->aperture_high /= PAGE_SIZE; |
kgem->aperture_total /= PAGE_SIZE; |
kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2; |
if ((int)kgem->fence_max < 0) |
1233,7 → 1251,7 |
return kgem->min_alignment; |
} |
void kgem_get_tile_size(struct kgem *kgem, int tiling, |
void kgem_get_tile_size(struct kgem *kgem, int tiling, int pitch, |
int *tile_width, int *tile_height, int *tile_size) |
{ |
if (kgem->gen <= 030) { |
1270,6 → 1288,10 |
*tile_size = 4096; |
break; |
} |
/* Force offset alignment to tile-row */ |
if (tiling && kgem->gen < 033) |
*tile_width = pitch; |
} |
uint32_t kgem_surface_size(struct kgem *kgem, |
1400,10 → 1422,15 |
static void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo) |
{ |
assert(bo->refcnt); |
assert(bo->proxy == NULL); |
bo->exec = kgem_add_handle(kgem, bo); |
bo->rq = MAKE_REQUEST(kgem->next_request, kgem->ring); |
list_move_tail(&bo->request, &kgem->next_request->buffers); |
if (bo->io && !list_is_empty(&bo->list)) |
list_move(&bo->list, &kgem->batch_buffers); |
/* XXX is it worth working around gcc here? */ |
kgem->flush |= bo->flush; |
1456,31 → 1483,11 |
b = bo->binding.next; |
while (b) { |
struct kgem_bo_binding *next = b->next; |
free (b); |
free(b); |
b = next; |
} |
} |
static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo) |
{ |
int type = IS_CPU_MAP(bo->map); |
assert(!IS_USER_MAP(bo->map)); |
DBG(("%s: releasing %s vma for handle=%d, count=%d\n", |
__FUNCTION__, type ? "CPU" : "GTT", |
bo->handle, kgem->vma[type].count)); |
VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); |
user_free(MAP(bo->map)); |
bo->map = NULL; |
if (!list_is_empty(&bo->vma)) { |
list_del(&bo->vma); |
kgem->vma[type].count--; |
} |
} |
static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo) |
{ |
DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); |
1496,22 → 1503,32 |
kgem_bo_binding_free(kgem, bo); |
if (IS_USER_MAP(bo->map)) { |
if (IS_USER_MAP(bo->map__cpu)) { |
assert(bo->rq == NULL); |
assert(!__kgem_busy(kgem, bo->handle)); |
assert(MAP(bo->map) != bo || bo->io || bo->flush); |
assert(MAP(bo->map__cpu) != bo || bo->io || bo->flush); |
if (!(bo->io || bo->flush)) { |
DBG(("%s: freeing snooped base\n", __FUNCTION__)); |
assert(bo != MAP(bo->map)); |
free(MAP(bo->map)); |
assert(bo != MAP(bo->map__cpu)); |
free(MAP(bo->map__cpu)); |
} |
bo->map = NULL; |
bo->map__cpu = NULL; |
} |
if (bo->map) |
kgem_bo_release_map(kgem, bo); |
assert(list_is_empty(&bo->vma)); |
assert(bo->map == NULL); |
DBG(("%s: releasing %p:%p vma for handle=%d, count=%d\n", |
__FUNCTION__, bo->map__gtt, bo->map__cpu, |
bo->handle, list_is_empty(&bo->vma) ? 0 : kgem->vma[bo->map__gtt == NULL].count)); |
if (!list_is_empty(&bo->vma)) { |
_list_del(&bo->vma); |
kgem->vma[bo->map__gtt == NULL].count--; |
} |
// if (bo->map__gtt) |
// munmap(MAP(bo->map__gtt), bytes(bo)); |
// if (bo->map__cpu) |
// munmap(MAP(bo->map__cpu), bytes(bo)); |
_list_del(&bo->list); |
_list_del(&bo->request); |
gem_close(kgem->fd, bo->handle); |
1546,22 → 1563,28 |
kgem->need_expire = true; |
if (bucket(bo) >= NUM_CACHE_BUCKETS) { |
list_move(&bo->list, &kgem->large_inactive); |
return; |
if (bo->map__gtt) { |
// munmap(MAP(bo->map__gtt), bytes(bo)); |
bo->map__gtt = NULL; |
} |
list_move(&bo->list, &kgem->large_inactive); |
} else { |
assert(bo->flush == false); |
list_move(&bo->list, &kgem->inactive[bucket(bo)]); |
if (bo->map) { |
int type = IS_CPU_MAP(bo->map); |
if (bucket(bo) >= NUM_CACHE_BUCKETS || |
(!type && !__kgem_bo_is_mappable(kgem, bo))) { |
// munmap(MAP(bo->map), bytes(bo)); |
bo->map = NULL; |
if (bo->map__gtt) { |
if (!kgem_bo_can_map(kgem, bo)) { |
// munmap(MAP(bo->map__gtt), bytes(bo)); |
bo->map__gtt = NULL; |
} |
if (bo->map__gtt) { |
list_add(&bo->vma, &kgem->vma[0].inactive[bucket(bo)]); |
kgem->vma[0].count++; |
} |
} |
if (bo->map) { |
list_add(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]); |
kgem->vma[type].count++; |
if (bo->map__cpu && !bo->map__gtt) { |
list_add(&bo->vma, &kgem->vma[1].inactive[bucket(bo)]); |
kgem->vma[1].count++; |
} |
} |
} |
1574,6 → 1597,10 |
return bo; |
assert(!bo->snoop); |
if (__kgem_freed_bo) { |
base = __kgem_freed_bo; |
__kgem_freed_bo = *(struct kgem_bo **)base; |
} else |
base = malloc(sizeof(*base)); |
if (base) { |
DBG(("%s: transferring io handle=%d to bo\n", |
1600,10 → 1627,10 |
list_del(&bo->list); |
assert(bo->rq == NULL); |
assert(bo->exec == NULL); |
if (bo->map) { |
assert(!list_is_empty(&bo->vma)); |
if (!list_is_empty(&bo->vma)) { |
assert(bo->map__gtt || bo->map__cpu); |
list_del(&bo->vma); |
kgem->vma[IS_CPU_MAP(bo->map)].count--; |
kgem->vma[bo->map__gtt == NULL].count--; |
} |
} |
1614,8 → 1641,10 |
list_del(&bo->list); |
assert(bo->rq != NULL); |
if (bo->rq == (void *)kgem) |
if (RQ(bo->rq) == (void *)kgem) { |
assert(bo->exec == NULL); |
list_del(&bo->request); |
} |
assert(list_is_empty(&bo->vma)); |
} |
1740,6 → 1769,7 |
if (kgem->nexec != 1 || bo->exec == NULL) |
return; |
assert(bo); |
DBG(("%s: only handle in batch, discarding last operations for handle=%d\n", |
__FUNCTION__, bo->handle)); |
1750,6 → 1780,10 |
bo->refcnt++; |
kgem_reset(kgem); |
bo->refcnt--; |
assert(kgem->nreloc == 0); |
assert(kgem->nexec == 0); |
assert(bo->exec == NULL); |
} |
static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) |
1777,7 → 1811,7 |
kgem_bo_move_to_snoop(kgem, bo); |
return; |
} |
if (!IS_USER_MAP(bo->map)) |
if (!IS_USER_MAP(bo->map__cpu)) |
bo->flush = false; |
if (bo->scanout) { |
1793,9 → 1827,6 |
goto destroy; |
} |
if (!kgem->has_llc && IS_CPU_MAP(bo->map) && bo->domain != DOMAIN_CPU) |
kgem_bo_release_map(kgem, bo); |
assert(list_is_empty(&bo->vma)); |
assert(list_is_empty(&bo->list)); |
assert(bo->flush == false); |
1824,7 → 1855,7 |
assert(bo->exec == NULL); |
assert(list_is_empty(&bo->request)); |
if (!IS_CPU_MAP(bo->map)) { |
if (bo->map__cpu == NULL || bucket(bo) >= NUM_CACHE_BUCKETS) { |
if (!kgem_bo_set_purgeable(kgem, bo)) |
goto destroy; |
1852,16 → 1883,18 |
static void kgem_buffer_release(struct kgem *kgem, struct kgem_buffer *bo) |
{ |
assert(bo->base.io); |
while (!list_is_empty(&bo->base.vma)) { |
struct kgem_bo *cached; |
cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma); |
assert(cached->proxy == &bo->base); |
assert(cached != &bo->base); |
list_del(&cached->vma); |
assert(*(struct kgem_bo **)cached->map == cached); |
*(struct kgem_bo **)cached->map = NULL; |
cached->map = NULL; |
assert(*(struct kgem_bo **)cached->map__gtt == cached); |
*(struct kgem_bo **)cached->map__gtt = NULL; |
cached->map__gtt = NULL; |
kgem_bo_destroy(kgem, cached); |
} |
1877,6 → 1910,10 |
struct kgem_buffer, |
base.list); |
DBG(("%s: handle=%d, busy? %d [%d]\n", |
__FUNCTION__, bo->base.handle, bo->base.rq != NULL, bo->base.exec != NULL)); |
assert(bo->base.exec == NULL || RQ(bo->base.rq) == kgem->next_request); |
if (bo->base.rq) |
break; |
1897,7 → 1934,7 |
bool retired = false; |
list_for_each_entry_safe(bo, next, &kgem->flushing, request) { |
assert(bo->rq == (void *)kgem); |
assert(RQ(bo->rq) == (void *)kgem); |
assert(bo->exec == NULL); |
if (__kgem_busy(kgem, bo->handle)) |
1960,7 → 1997,8 |
DBG(("%s: moving %d to flushing\n", |
__FUNCTION__, bo->handle)); |
list_add(&bo->request, &kgem->flushing); |
bo->rq = (void *)kgem; |
bo->rq = MAKE_REQUEST(kgem, RQ_RING(bo->rq)); |
kgem->need_retire = true; |
continue; |
} |
1985,6 → 2023,7 |
} |
assert(rq->bo->rq == NULL); |
assert(rq->bo->exec == NULL); |
assert(list_is_empty(&rq->bo->request)); |
if (--rq->bo->refcnt == 0) { |
2057,7 → 2096,7 |
{ |
bool retired = false; |
DBG(("%s\n", __FUNCTION__)); |
DBG(("%s, need_retire?=%d\n", __FUNCTION__, kgem->need_retire)); |
kgem->need_retire = false; |
2077,6 → 2116,7 |
{ |
struct kgem_request *rq; |
assert(ring < ARRAY_SIZE(kgem->requests)); |
assert(!list_is_empty(&kgem->requests[ring])); |
rq = list_last_entry(&kgem->requests[ring], |
2091,10 → 2131,24 |
__FUNCTION__, ring, rq->bo->handle)); |
kgem_retire__requests_ring(kgem, ring); |
kgem_retire__buffers(kgem); |
assert(list_is_empty(&kgem->requests[ring])); |
return true; |
} |
#ifndef NDEBUG |
static void kgem_commit__check_buffers(struct kgem *kgem) |
{ |
struct kgem_buffer *bo; |
list_for_each_entry(bo, &kgem->active_buffers, base.list) |
assert(bo->base.exec == NULL); |
} |
#else |
#define kgem_commit__check_buffers(kgem) |
#endif |
static void kgem_commit(struct kgem *kgem) |
{ |
struct kgem_request *rq = kgem->next_request; |
2118,6 → 2172,7 |
if (!bo->refcnt && !bo->reusable) { |
assert(!bo->snoop); |
assert(!bo->proxy); |
kgem_bo_free(kgem, bo); |
continue; |
} |
2128,7 → 2183,6 |
if (bo->proxy) { |
/* proxies are not used for domain tracking */ |
bo->exec = NULL; |
__kgem_bo_clear_busy(bo); |
} |
2152,7 → 2206,8 |
kgem_retire(kgem); |
assert(list_is_empty(&rq->buffers)); |
assert(rq->bo->map == NULL); |
assert(rq->bo->map__gtt == NULL); |
assert(rq->bo->map__cpu == NULL); |
gem_close(kgem->fd, rq->bo->handle); |
kgem_cleanup_cache(kgem); |
} else { |
2161,6 → 2216,8 |
} |
kgem->next_request = NULL; |
kgem_commit__check_buffers(kgem); |
} |
static void kgem_close_list(struct kgem *kgem, struct list *head) |
2182,17 → 2239,18 |
struct kgem_buffer *bo, *next; |
list_for_each_entry_safe(bo, next, &kgem->batch_buffers, base.list) { |
DBG(("%s: buffer handle=%d, used=%d, exec?=%d, write=%d, mmapped=%s\n", |
DBG(("%s: buffer handle=%d, used=%d, exec?=%d, write=%d, mmapped=%s, refcnt=%d\n", |
__FUNCTION__, bo->base.handle, bo->used, bo->base.exec!=NULL, |
bo->write, bo->mmapped ? IS_CPU_MAP(bo->base.map) ? "cpu" : "gtt" : "no")); |
bo->write, bo->mmapped == MMAPPED_CPU ? "cpu" : bo->mmapped == MMAPPED_GTT ? "gtt" : "no", |
bo->base.refcnt)); |
assert(next->base.list.prev == &bo->base.list); |
assert(bo->base.io); |
assert(bo->base.refcnt >= 1); |
if (!bo->base.exec) { |
DBG(("%s: skipping unattached handle=%d, used=%d\n", |
__FUNCTION__, bo->base.handle, bo->used)); |
if (bo->base.refcnt > 1 && !bo->base.exec) { |
DBG(("%s: skipping unattached handle=%d, used=%d, refcnt=%d\n", |
__FUNCTION__, bo->base.handle, bo->used, bo->base.refcnt)); |
continue; |
} |
2202,7 → 2260,7 |
} |
if (bo->mmapped) { |
int used; |
uint32_t used; |
assert(!bo->need_io); |
2209,20 → 2267,21 |
used = ALIGN(bo->used, PAGE_SIZE); |
if (!DBG_NO_UPLOAD_ACTIVE && |
used + PAGE_SIZE <= bytes(&bo->base) && |
(kgem->has_llc || !IS_CPU_MAP(bo->base.map) || bo->base.snoop)) { |
DBG(("%s: retaining upload buffer (%d/%d)\n", |
__FUNCTION__, bo->used, bytes(&bo->base))); |
(kgem->has_llc || bo->mmapped == MMAPPED_GTT || bo->base.snoop)) { |
DBG(("%s: retaining upload buffer (%d/%d): used=%d, refcnt=%d\n", |
__FUNCTION__, bo->used, bytes(&bo->base), used, bo->base.refcnt)); |
bo->used = used; |
list_move(&bo->base.list, |
&kgem->active_buffers); |
kgem->need_retire = true; |
continue; |
} |
DBG(("%s: discarding mmapped buffer, used=%d, map type=%d\n", |
__FUNCTION__, bo->used, (int)__MAP_TYPE(bo->base.map))); |
__FUNCTION__, bo->used, bo->mmapped)); |
goto decouple; |
} |
if (!bo->used) { |
if (!bo->used || !bo->base.exec) { |
/* Unless we replace the handle in the execbuffer, |
* then this bo will become active. So decouple it |
* from the buffer list and track it in the normal |
2301,7 → 2360,7 |
bo->base.handle, shrink->handle)); |
assert(bo->used <= bytes(shrink)); |
if (gem_write(kgem->fd, shrink->handle, |
if (gem_write__cachealigned(kgem->fd, shrink->handle, |
0, bo->used, bo->mem) == 0) { |
shrink->target_handle = |
kgem->has_handle_lut ? bo->base.target_handle : shrink->handle; |
2340,7 → 2399,7 |
__FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base))); |
ASSERT_IDLE(kgem, bo->base.handle); |
assert(bo->used <= bytes(&bo->base)); |
gem_write(kgem->fd, bo->base.handle, |
gem_write__cachealigned(kgem->fd, bo->base.handle, |
0, bo->used, bo->mem); |
bo->need_io = 0; |
2390,33 → 2449,58 |
ASSERT_IDLE(kgem, handle); |
retry: |
/* If there is no surface data, just upload the batch */ |
if (kgem->surface == kgem->batch_size) |
return gem_write(kgem->fd, handle, |
if (kgem->surface == kgem->batch_size) { |
if (gem_write__cachealigned(kgem->fd, handle, |
0, sizeof(uint32_t)*kgem->nbatch, |
kgem->batch); |
kgem->batch) == 0) |
return 0; |
goto expire; |
} |
/* Are the batch pages conjoint with the surface pages? */ |
if (kgem->surface < kgem->nbatch + PAGE_SIZE/sizeof(uint32_t)) { |
assert(size == PAGE_ALIGN(kgem->batch_size*sizeof(uint32_t))); |
return gem_write(kgem->fd, handle, |
if (gem_write__cachealigned(kgem->fd, handle, |
0, kgem->batch_size*sizeof(uint32_t), |
kgem->batch); |
kgem->batch) == 0) |
return 0; |
goto expire; |
} |
/* Disjoint surface/batch, upload separately */ |
ret = gem_write(kgem->fd, handle, |
if (gem_write__cachealigned(kgem->fd, handle, |
0, sizeof(uint32_t)*kgem->nbatch, |
kgem->batch); |
if (ret) |
return ret; |
kgem->batch)) |
goto expire; |
ret = PAGE_ALIGN(sizeof(uint32_t) * kgem->batch_size); |
ret -= sizeof(uint32_t) * kgem->surface; |
assert(size-ret >= kgem->nbatch*sizeof(uint32_t)); |
return __gem_write(kgem->fd, handle, |
if (gem_write(kgem->fd, handle, |
size - ret, (kgem->batch_size - kgem->surface)*sizeof(uint32_t), |
kgem->batch + kgem->surface); |
kgem->batch + kgem->surface)) |
goto expire; |
return 0; |
expire: |
ret = errno; |
assert(ret != EINVAL); |
(void)__kgem_throttle_retire(kgem, 0); |
if (kgem_expire_cache(kgem)) |
goto retry; |
if (kgem_cleanup_cache(kgem)) |
goto retry; |
ErrorF("%s: failed to write batch (handle=%d): %d\n", |
__FUNCTION__, handle, ret); |
return ret; |
} |
void kgem_reset(struct kgem *kgem) |
2442,6 → 2526,7 |
assert(bo->domain == DOMAIN_GPU || bo->domain == DOMAIN_NONE); |
list_add(&bo->request, &kgem->flushing); |
bo->rq = (void *)kgem; |
kgem->need_retire = true; |
} else |
__kgem_bo_clear_busy(bo); |
2474,6 → 2559,7 |
kgem->nreloc__self = 0; |
kgem->aperture = 0; |
kgem->aperture_fenced = 0; |
kgem->aperture_max_fence = 0; |
kgem->nbatch = 0; |
kgem->surface = kgem->batch_size; |
kgem->mode = KGEM_NONE; |
2599,10 → 2685,10 |
batch_end = kgem_end_batch(kgem); |
kgem_sna_flush(kgem); |
DBG(("batch[%d/%d, flags=%x]: %d %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d\n", |
DBG(("batch[%d/%d, flags=%x]: %d %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d [fenced=%d]\n", |
kgem->mode, kgem->ring, kgem->batch_flags, |
batch_end, kgem->nbatch, kgem->surface, kgem->batch_size, |
kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture)); |
kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced)); |
assert(kgem->nbatch <= kgem->batch_size); |
assert(kgem->nbatch <= kgem->surface); |
2660,8 → 2746,8 |
{ |
int fd = open("/tmp1/1/batchbuffer.bin", O_CREAT|O_WRONLY|O_BINARY); |
if (fd != -1) { |
write(fd, kgem->batch, size); |
close(fd); |
ret = write(fd, kgem->batch, batch_end*sizeof(uint32_t)); |
fd = close(fd); |
} |
else printf("SNA: failed to write batchbuffer\n"); |
asm volatile("int3"); |
2694,9 → 2780,9 |
#if 0 |
ret = errno; |
ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d: errno=%d\n", |
ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d, fenced=%d, high=%d,%d: errno=%d\n", |
kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface, |
kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, errno); |
kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced, kgem->aperture_high, kgem->aperture_total, errno); |
for (i = 0; i < kgem->nexec; i++) { |
struct kgem_bo *bo, *found = NULL; |
2764,7 → 2850,7 |
} |
} |
void kgem_purge_cache(struct kgem *kgem) |
static void kgem_purge_cache(struct kgem *kgem) |
{ |
struct kgem_bo *bo, *next; |
int i; |
2892,7 → 2978,7 |
break; |
} |
if (bo->map && bo->delta + MAP_PRESERVE_TIME > expire) { |
if (bo->map__cpu && bo->delta + MAP_PRESERVE_TIME > expire) { |
idle = false; |
list_move_tail(&bo->list, &preserve); |
} else { |
2932,7 → 3018,7 |
(void)size; |
} |
void kgem_cleanup_cache(struct kgem *kgem) |
bool kgem_cleanup_cache(struct kgem *kgem) |
{ |
unsigned int i; |
int n; |
2962,6 → 3048,9 |
kgem_retire(kgem); |
kgem_cleanup(kgem); |
if (!kgem->need_expire) |
return false; |
for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { |
while (!list_is_empty(&kgem->inactive[i])) |
kgem_bo_free(kgem, |
2984,6 → 3073,7 |
kgem->need_purge = false; |
kgem->need_expire = false; |
return true; |
} |
static struct kgem_bo * |
3028,8 → 3118,10 |
goto discard; |
list_del(&bo->list); |
if (bo->rq == (void *)kgem) |
if (RQ(bo->rq) == (void *)kgem) { |
assert(bo->exec == NULL); |
list_del(&bo->request); |
} |
bo->delta = 0; |
assert_tiling(kgem, bo); |
3083,7 → 3175,7 |
__FUNCTION__, for_cpu ? "cpu" : "gtt")); |
cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)]; |
list_for_each_entry(bo, cache, vma) { |
assert(IS_CPU_MAP(bo->map) == for_cpu); |
assert(for_cpu ? bo->map__cpu : bo->map__gtt); |
assert(bucket(bo) == cache_bucket(num_pages)); |
assert(bo->proxy == NULL); |
assert(bo->rq == NULL); |
3107,6 → 3199,8 |
continue; |
kgem_bo_remove_from_inactive(kgem, bo); |
assert(list_is_empty(&bo->vma)); |
assert(list_is_empty(&bo->list)); |
bo->tiling = I915_TILING_NONE; |
bo->pitch = 0; |
3163,10 → 3257,10 |
bo->pitch = 0; |
} |
if (bo->map) { |
if (bo->map__gtt || bo->map__cpu) { |
if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { |
int for_cpu = !!(flags & CREATE_CPU_MAP); |
if (IS_CPU_MAP(bo->map) != for_cpu) { |
if (for_cpu ? bo->map__cpu : bo->map__gtt){ |
if (first != NULL) |
break; |
3181,6 → 3275,9 |
continue; |
} |
} else { |
if (flags & CREATE_GTT_MAP && !kgem_bo_can_map(kgem, bo)) |
continue; |
if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { |
if (first != NULL) |
break; |
3202,6 → 3299,7 |
__FUNCTION__, bo->handle, num_pages(bo), |
use_active ? "active" : "inactive")); |
assert(list_is_empty(&bo->list)); |
assert(list_is_empty(&bo->vma)); |
assert(use_active || bo->domain != DOMAIN_GPU); |
assert(!bo->needs_flush || use_active); |
assert_tiling(kgem, bo); |
3223,6 → 3321,7 |
__FUNCTION__, first->handle, num_pages(first), |
use_active ? "active" : "inactive")); |
assert(list_is_empty(&first->list)); |
assert(list_is_empty(&first->vma)); |
assert(use_active || first->domain != DOMAIN_GPU); |
assert(!first->needs_flush || use_active); |
ASSERT_MAYBE_IDLE(kgem, first->handle, !use_active); |
3282,11 → 3381,11 |
assert(kgem->gen < 040); |
if (kgem->gen < 030) |
size = 512 * 1024; |
size = 512 * 1024 / PAGE_SIZE; |
else |
size = 1024 * 1024; |
while (size < bytes(bo)) |
size *= 2; |
size = 1024 * 1024 / PAGE_SIZE; |
while (size < num_pages(bo)) |
size <<= 1; |
return size; |
} |
3308,7 → 3407,6 |
if (tiling < 0) |
exact = true, tiling = -tiling; |
DBG(("%s(%dx%d, bpp=%d, tiling=%d, exact=%d, inactive=%d, cpu-mapping=%d, gtt-mapping=%d, scanout?=%d, prime?=%d, temp?=%d)\n", __FUNCTION__, |
width, height, bpp, tiling, exact, |
!!(flags & CREATE_INACTIVE), |
3324,61 → 3422,6 |
size /= PAGE_SIZE; |
bucket = cache_bucket(size); |
if (flags & CREATE_SCANOUT) { |
struct kgem_bo *last = NULL; |
list_for_each_entry_reverse(bo, &kgem->scanout, list) { |
assert(bo->scanout); |
assert(bo->delta); |
assert(!bo->flush); |
assert_tiling(kgem, bo); |
if (size > num_pages(bo) || num_pages(bo) > 2*size) |
continue; |
if (bo->tiling != tiling || |
(tiling != I915_TILING_NONE && bo->pitch != pitch)) { |
if (!gem_set_tiling(kgem->fd, bo->handle, |
tiling, pitch)) |
continue; |
bo->tiling = tiling; |
bo->pitch = pitch; |
} |
if (flags & CREATE_INACTIVE && bo->rq) { |
last = bo; |
continue; |
} |
list_del(&bo->list); |
bo->unique_id = kgem_get_unique_id(kgem); |
DBG((" 1:from scanout: pitch=%d, tiling=%d, handle=%d, id=%d\n", |
bo->pitch, bo->tiling, bo->handle, bo->unique_id)); |
assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); |
assert_tiling(kgem, bo); |
bo->refcnt = 1; |
return bo; |
} |
if (last) { |
list_del(&last->list); |
last->unique_id = kgem_get_unique_id(kgem); |
DBG((" 1:from scanout: pitch=%d, tiling=%d, handle=%d, id=%d\n", |
last->pitch, last->tiling, last->handle, last->unique_id)); |
assert(last->pitch*kgem_aligned_height(kgem, height, last->tiling) <= kgem_bo_size(last)); |
assert_tiling(kgem, last); |
last->refcnt = 1; |
return last; |
} |
bo = NULL; //__kgem_bo_create_as_display(kgem, size, tiling, pitch); |
if (bo) |
return bo; |
} |
if (bucket >= NUM_CACHE_BUCKETS) { |
DBG(("%s: large bo num pages=%d, bucket=%d\n", |
__FUNCTION__, size, bucket)); |
3428,7 → 3471,6 |
assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); |
assert_tiling(kgem, bo); |
bo->refcnt = 1; |
bo->flush = true; |
return bo; |
} |
3488,9 → 3530,9 |
assert(bucket(bo) == bucket); |
assert(bo->refcnt == 0); |
assert(!bo->scanout); |
assert(bo->map); |
assert(IS_CPU_MAP(bo->map) == for_cpu); |
assert(for_cpu ? bo->map__cpu : bo->map__gtt); |
assert(bo->rq == NULL); |
assert(bo->exec == NULL); |
assert(list_is_empty(&bo->request)); |
assert(bo->flush == false); |
assert_tiling(kgem, bo); |
3520,6 → 3562,8 |
bo->domain = DOMAIN_NONE; |
kgem_bo_remove_from_inactive(kgem, bo); |
assert(list_is_empty(&bo->list)); |
assert(list_is_empty(&bo->vma)); |
DBG((" from inactive vma: pitch=%d, tiling=%d: handle=%d, id=%d\n", |
bo->pitch, bo->tiling, bo->handle, bo->unique_id)); |
3740,9 → 3784,6 |
if (!gem_set_tiling(kgem->fd, bo->handle, |
tiling, pitch)) |
continue; |
if (bo->map) |
kgem_bo_release_map(kgem, bo); |
} |
if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { |
3751,6 → 3792,8 |
} |
kgem_bo_remove_from_inactive(kgem, bo); |
assert(list_is_empty(&bo->list)); |
assert(list_is_empty(&bo->vma)); |
bo->pitch = pitch; |
bo->tiling = tiling; |
3799,12 → 3842,6 |
return NULL; |
} |
if (bucket >= NUM_CACHE_BUCKETS) { |
DBG(("%s: marking large bo for automatic flushing\n", |
__FUNCTION__)); |
bo->flush = true; |
} |
bo->unique_id = kgem_get_unique_id(kgem); |
if (tiling == I915_TILING_NONE || |
gem_set_tiling(kgem->fd, handle, tiling, pitch)) { |
3935,16 → 3972,21 |
__FUNCTION__, bo->handle, bo->proxy != NULL)); |
if (bo->proxy) { |
assert(!bo->reusable); |
kgem_bo_binding_free(kgem, bo); |
assert(list_is_empty(&bo->list)); |
_list_del(&bo->vma); |
_list_del(&bo->request); |
if (bo->io && bo->exec == NULL) |
if (bo->io && bo->domain == DOMAIN_CPU) |
_kgem_bo_delete_buffer(kgem, bo); |
kgem_bo_unref(kgem, bo->proxy); |
kgem_bo_binding_free(kgem, bo); |
free(bo); |
return; |
} |
*(struct kgem_bo **)bo = __kgem_freed_bo; |
__kgem_freed_bo = bo; |
} else |
__kgem_bo_destroy(kgem, bo); |
} |
3989,6 → 4031,58 |
return kgem->nreloc && bo->rq && RQ_RING(bo->rq) != kgem->ring; |
} |
static bool aperture_check(struct kgem *kgem, unsigned num_pages) |
{ |
if (kgem->aperture) { |
struct drm_i915_gem_get_aperture aperture; |
VG_CLEAR(aperture); |
aperture.aper_available_size = kgem->aperture_high; |
aperture.aper_available_size *= PAGE_SIZE; |
(void)drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture); |
DBG(("%s: aperture required %ld bytes, available %ld bytes\n", |
__FUNCTION__, |
(long)num_pages * PAGE_SIZE, |
(long)aperture.aper_available_size)); |
/* Leave some space in case of alignment issues */ |
aperture.aper_available_size -= 1024 * 1024; |
aperture.aper_available_size -= kgem->aperture_mappable * PAGE_SIZE / 2; |
if (kgem->gen < 033) |
aperture.aper_available_size -= kgem->aperture_max_fence * PAGE_SIZE; |
if (!kgem->has_llc) |
aperture.aper_available_size -= 2 * kgem->nexec * PAGE_SIZE; |
DBG(("%s: num_pages=%d, estimated max usable=%ld\n", |
__FUNCTION__, num_pages, (long)(aperture.aper_available_size/PAGE_SIZE))); |
if (num_pages <= aperture.aper_available_size / PAGE_SIZE) |
return true; |
} |
return false; |
} |
static inline bool kgem_flush(struct kgem *kgem, bool flush) |
{ |
if (unlikely(kgem->wedged)) |
return false; |
if (kgem->nreloc == 0) |
return true; |
if (container_of(kgem, struct sna, kgem)->flags & SNA_POWERSAVE) |
return true; |
if (kgem->flush == flush && kgem->aperture < kgem->aperture_low) |
return true; |
DBG(("%s: opportunistic flushing? flush=%d,%d, aperture=%d/%d, idle?=%d\n", |
__FUNCTION__, kgem->flush, flush, kgem->aperture, kgem->aperture_low, kgem_ring_is_idle(kgem, kgem->ring))); |
return !kgem_ring_is_idle(kgem, kgem->ring); |
} |
bool kgem_check_bo(struct kgem *kgem, ...) |
{ |
va_list ap; |
3996,6 → 4090,7 |
int num_exec = 0; |
int num_pages = 0; |
bool flush = false; |
bool busy = true; |
va_start(ap, kgem); |
while ((bo = va_arg(ap, struct kgem_bo *))) { |
4004,13 → 4099,16 |
if (bo->exec) |
continue; |
if (needs_semaphore(kgem, bo)) |
if (needs_semaphore(kgem, bo)) { |
DBG(("%s: flushing for required semaphore\n", __FUNCTION__)); |
return false; |
} |
num_pages += num_pages(bo); |
num_exec++; |
flush |= bo->flush; |
busy &= bo->rq != NULL; |
} |
va_end(ap); |
4020,47 → 4118,129 |
if (!num_pages) |
return true; |
if (kgem_flush(kgem, flush)) |
if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) { |
DBG(("%s: out of exec slots (%d + %d / %d)\n", __FUNCTION__, |
kgem->nexec, num_exec, KGEM_EXEC_SIZE(kgem))); |
return false; |
if (kgem->aperture > kgem->aperture_low && |
kgem_ring_is_idle(kgem, kgem->ring)) { |
DBG(("%s: current aperture usage (%d) is greater than low water mark (%d)\n", |
__FUNCTION__, kgem->aperture, kgem->aperture_low)); |
return false; |
} |
if (num_pages + kgem->aperture > kgem->aperture_high) { |
DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", |
__FUNCTION__, num_pages + kgem->aperture, kgem->aperture_high)); |
if (!aperture_check(kgem, num_pages + kgem->aperture)) |
return false; |
} |
if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) { |
DBG(("%s: out of exec slots (%d + %d / %d)\n", __FUNCTION__, |
kgem->nexec, num_exec, KGEM_EXEC_SIZE(kgem))); |
return false; |
} |
if (busy) |
return true; |
return true; |
return kgem_flush(kgem, flush); |
} |
#if 0 |
bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo) |
{ |
assert(bo->refcnt); |
while (bo->proxy) |
bo = bo->proxy; |
assert(bo->refcnt); |
if (bo->exec) { |
if (kgem->gen < 040 && |
bo->tiling != I915_TILING_NONE && |
(bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { |
uint32_t size; |
assert(bo->tiling == I915_TILING_X); |
if (kgem->nfence >= kgem->fence_max) |
return false; |
if (kgem->aperture_fenced) { |
size = 3*kgem->aperture_fenced; |
if (kgem->aperture_total == kgem->aperture_mappable) |
size += kgem->aperture; |
if (size > kgem->aperture_mappable && |
kgem_ring_is_idle(kgem, kgem->ring)) { |
DBG(("%s: opportunistic fence flush\n", __FUNCTION__)); |
return false; |
} |
} |
size = kgem_bo_fenced_size(kgem, bo); |
if (size > kgem->aperture_max_fence) |
kgem->aperture_max_fence = size; |
size += kgem->aperture_fenced; |
if (kgem->gen < 033) |
size += kgem->aperture_max_fence; |
if (kgem->aperture_total == kgem->aperture_mappable) |
size += kgem->aperture; |
if (size > kgem->aperture_mappable) { |
DBG(("%s: estimated fence space required [%d] exceed aperture [%d]\n", |
__FUNCTION__, size, kgem->aperture_mappable)); |
return false; |
} |
} |
return true; |
} |
if (kgem->nexec >= KGEM_EXEC_SIZE(kgem) - 1) |
return false; |
if (needs_semaphore(kgem, bo)) { |
DBG(("%s: flushing for required semaphore\n", __FUNCTION__)); |
return false; |
} |
assert_tiling(kgem, bo); |
if (kgem->gen < 040 && bo->tiling != I915_TILING_NONE) { |
uint32_t size; |
assert(bo->tiling == I915_TILING_X); |
if (kgem->nfence >= kgem->fence_max) |
return false; |
if (kgem->aperture_fenced) { |
size = 3*kgem->aperture_fenced; |
if (kgem->aperture_total == kgem->aperture_mappable) |
size += kgem->aperture; |
if (size > kgem->aperture_mappable && |
kgem_ring_is_idle(kgem, kgem->ring)) { |
DBG(("%s: opportunistic fence flush\n", __FUNCTION__)); |
return false; |
} |
} |
size = kgem_bo_fenced_size(kgem, bo); |
if (size > kgem->aperture_max_fence) |
kgem->aperture_max_fence = size; |
size += kgem->aperture_fenced; |
if (kgem->gen < 033) |
size += kgem->aperture_max_fence; |
if (kgem->aperture_total == kgem->aperture_mappable) |
size += kgem->aperture; |
if (size > kgem->aperture_mappable) { |
DBG(("%s: estimated fence space required [%d] exceed aperture [%d]\n", |
__FUNCTION__, size, kgem->aperture_mappable)); |
return false; |
} |
} |
if (kgem->aperture + kgem->aperture_fenced + num_pages(bo) > kgem->aperture_high) { |
DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", |
__FUNCTION__, num_pages(bo) + kgem->aperture, kgem->aperture_high)); |
if (!aperture_check(kgem, num_pages(bo) + kgem->aperture + kgem->aperture_fenced)) |
return false; |
} |
if (bo->rq) |
return true; |
return kgem_flush(kgem, bo->flush); |
} |
#endif |
4074,6 → 4254,10 |
uint32_t kgem_add_reloc(struct kgem *kgem, |
uint32_t pos, |
struct kgem_bo *bo, |
4085,23 → 4269,14 |
DBG(("%s: handle=%d, pos=%d, delta=%d, domains=%08x\n", |
__FUNCTION__, bo ? bo->handle : 0, pos, delta, read_write_domain)); |
assert(kgem->gen < 0100); |
assert((read_write_domain & 0x7fff) == 0 || bo != NULL); |
if( bo != NULL && bo->handle == -2) |
{ |
if (bo->exec == NULL) |
kgem_add_bo(kgem, bo); |
if (read_write_domain & 0x7fff && !bo->gpu_dirty) { |
__kgem_bo_mark_dirty(bo); |
} |
return 0; |
}; |
index = kgem->nreloc++; |
assert(index < ARRAY_SIZE(kgem->reloc)); |
kgem->reloc[index].offset = pos * sizeof(kgem->batch[0]); |
if (bo) { |
assert(kgem->mode != KGEM_NONE); |
assert(bo->refcnt); |
while (bo->proxy) { |
DBG(("%s: adding proxy [delta=%d] for handle=%d\n", |
4115,6 → 4290,7 |
bo->rq = MAKE_REQUEST(kgem->next_request, |
kgem->ring); |
bo->exec = &_kgem_dummy_exec; |
bo->domain = DOMAIN_GPU; |
} |
if (read_write_domain & 0x7fff && !bo->gpu_dirty) |
4133,6 → 4309,7 |
if (kgem->gen < 040 && read_write_domain & KGEM_RELOC_FENCED) { |
if (bo->tiling && |
(bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { |
assert(bo->tiling == I915_TILING_X); |
assert(kgem->nfence < kgem->fence_max); |
kgem->aperture_fenced += |
kgem_bo_fenced_size(kgem, bo); |
4164,6 → 4341,77 |
return delta; |
} |
uint64_t kgem_add_reloc64(struct kgem *kgem, |
uint32_t pos, |
struct kgem_bo *bo, |
uint32_t read_write_domain, |
uint64_t delta) |
{ |
int index; |
DBG(("%s: handle=%d, pos=%d, delta=%ld, domains=%08x\n", |
__FUNCTION__, bo ? bo->handle : 0, pos, (long)delta, read_write_domain)); |
assert(kgem->gen >= 0100); |
assert((read_write_domain & 0x7fff) == 0 || bo != NULL); |
index = kgem->nreloc++; |
assert(index < ARRAY_SIZE(kgem->reloc)); |
kgem->reloc[index].offset = pos * sizeof(kgem->batch[0]); |
if (bo) { |
assert(kgem->mode != KGEM_NONE); |
assert(bo->refcnt); |
while (bo->proxy) { |
DBG(("%s: adding proxy [delta=%ld] for handle=%d\n", |
__FUNCTION__, (long)bo->delta, bo->handle)); |
delta += bo->delta; |
assert(bo->handle == bo->proxy->handle); |
/* need to release the cache upon batch submit */ |
if (bo->exec == NULL) { |
list_move_tail(&bo->request, |
&kgem->next_request->buffers); |
bo->rq = MAKE_REQUEST(kgem->next_request, |
kgem->ring); |
bo->exec = &_kgem_dummy_exec; |
bo->domain = DOMAIN_GPU; |
} |
if (read_write_domain & 0x7fff && !bo->gpu_dirty) |
__kgem_bo_mark_dirty(bo); |
bo = bo->proxy; |
assert(bo->refcnt); |
} |
assert(bo->refcnt); |
if (bo->exec == NULL) |
kgem_add_bo(kgem, bo); |
assert(bo->rq == MAKE_REQUEST(kgem->next_request, kgem->ring)); |
assert(RQ_RING(bo->rq) == kgem->ring); |
kgem->reloc[index].delta = delta; |
kgem->reloc[index].target_handle = bo->target_handle; |
kgem->reloc[index].presumed_offset = bo->presumed_offset; |
if (read_write_domain & 0x7fff && !bo->gpu_dirty) { |
assert(!bo->snoop || kgem->can_blt_cpu); |
__kgem_bo_mark_dirty(bo); |
} |
delta += bo->presumed_offset; |
} else { |
kgem->reloc[index].delta = delta; |
kgem->reloc[index].target_handle = ~0U; |
kgem->reloc[index].presumed_offset = 0; |
if (kgem->nreloc__self < 256) |
kgem->reloc__self[kgem->nreloc__self++] = index; |
} |
kgem->reloc[index].read_domains = read_write_domain >> 16; |
kgem->reloc[index].write_domain = read_write_domain & 0x7fff; |
return delta; |
} |
static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) |
{ |
int i, j; |
4186,6 → 4434,7 |
i = 0; |
while (kgem->vma[type].count > 0) { |
struct kgem_bo *bo = NULL; |
void **ptr; |
for (j = 0; |
bo == NULL && j < ARRAY_SIZE(kgem->vma[type].inactive); |
4198,15 → 4447,14 |
break; |
DBG(("%s: discarding inactive %s vma cache for %d\n", |
__FUNCTION__, |
IS_CPU_MAP(bo->map) ? "CPU" : "GTT", bo->handle)); |
assert(IS_CPU_MAP(bo->map) == type); |
assert(bo->map); |
__FUNCTION__, type ? "CPU" : "GTT", bo->handle)); |
ptr = type ? &bo->map__cpu : &bo->map__gtt; |
assert(bo->rq == NULL); |
VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); |
// munmap(MAP(bo->map), bytes(bo)); |
bo->map = NULL; |
VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(*ptr), bytes(bo))); |
// munmap(MAP(*ptr), bytes(bo)); |
*ptr = NULL; |
list_del(&bo->vma); |
kgem->vma[type].count--; |
4222,12 → 4470,11 |
{ |
void *ptr; |
DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, |
bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); |
DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, |
bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); |
assert(bo->proxy == NULL); |
assert(list_is_empty(&bo->list)); |
assert(!IS_USER_MAP(bo->map)); |
assert_tiling(kgem, bo); |
if (bo->tiling == I915_TILING_NONE && !bo->scanout && kgem->has_llc) { |
4236,12 → 4483,9 |
return kgem_bo_map__cpu(kgem, bo); |
} |
if (IS_CPU_MAP(bo->map)) |
kgem_bo_release_map(kgem, bo); |
ptr = bo->map; |
ptr = MAP(bo->map__gtt); |
if (ptr == NULL) { |
assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); |
assert(num_pages(bo) <= kgem->aperture_mappable / 2); |
kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); |
4254,7 → 4498,7 |
* issue with compositing managers which need to frequently |
* flush CPU damage to their GPU bo. |
*/ |
bo->map = ptr; |
bo->map__gtt = ptr; |
DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); |
} |
4265,12 → 4509,11 |
{ |
void *ptr; |
DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, |
bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); |
DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, |
bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); |
assert(bo->proxy == NULL); |
assert(list_is_empty(&bo->list)); |
assert(!IS_USER_MAP(bo->map)); |
assert(bo->exec == NULL); |
assert_tiling(kgem, bo); |
4284,12 → 4527,9 |
return ptr; |
} |
if (IS_CPU_MAP(bo->map)) |
kgem_bo_release_map(kgem, bo); |
ptr = bo->map; |
ptr = MAP(bo->map__gtt); |
if (ptr == NULL) { |
assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); |
assert(num_pages(bo) <= kgem->aperture_mappable / 2); |
assert(kgem->gen != 021 || bo->tiling != I915_TILING_Y); |
kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); |
4303,7 → 4543,7 |
* issue with compositing managers which need to frequently |
* flush CPU damage to their GPU bo. |
*/ |
bo->map = ptr; |
bo->map__gtt = ptr; |
DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); |
} |
4333,20 → 4573,16 |
{ |
void *ptr; |
DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, |
bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); |
DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, |
bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); |
assert(bo->exec == NULL); |
assert(list_is_empty(&bo->list)); |
assert(!IS_USER_MAP(bo->map)); |
assert_tiling(kgem, bo); |
if (IS_CPU_MAP(bo->map)) |
kgem_bo_release_map(kgem, bo); |
ptr = bo->map; |
ptr = MAP(bo->map__gtt); |
if (ptr == NULL) { |
assert(bytes(bo) <= kgem->aperture_mappable / 4); |
assert(num_pages(bo) <= kgem->aperture_mappable / 4); |
kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); |
4359,7 → 4595,7 |
* issue with compositing managers which need to frequently |
* flush CPU damage to their GPU bo. |
*/ |
bo->map = ptr; |
bo->map__gtt = ptr; |
DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); |
} |
4368,11 → 4604,7 |
void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo) |
{ |
if (bo->map) |
return MAP(bo->map); |
kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); |
return bo->map = __kgem_bo_map__gtt(kgem, bo); |
return kgem_bo_map__async(kgem, bo); |
} |
void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) |
4379,18 → 4611,15 |
{ |
struct drm_i915_gem_mmap mmap_arg; |
DBG(("%s(handle=%d, size=%d, mapped? %d)\n", |
__FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); |
DBG(("%s(handle=%d, size=%d, map=%p:%p)\n", |
__FUNCTION__, bo->handle, bytes(bo), bo->map__gtt, bo->map__cpu)); |
assert(!bo->purged); |
assert(list_is_empty(&bo->list)); |
assert(bo->proxy == NULL); |
if (IS_CPU_MAP(bo->map)) |
return MAP(bo->map); |
if (bo->map__cpu) |
return MAP(bo->map__cpu); |
if (bo->map) |
kgem_bo_release_map(kgem, bo); |
kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo)); |
retry: |
4399,17 → 4628,17 |
mmap_arg.offset = 0; |
mmap_arg.size = bytes(bo); |
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { |
int err = 0; |
if (__kgem_throttle_retire(kgem, 0)) |
goto retry; |
if (kgem->need_expire) { |
kgem_cleanup_cache(kgem); |
if (kgem_cleanup_cache(kgem)) |
goto retry; |
} |
ErrorF("%s: failed to mmap handle=%d, %d bytes, into CPU domain\n", |
__FUNCTION__, bo->handle, bytes(bo)); |
ErrorF("%s: failed to mmap handle=%d, %d bytes, into CPU domain: %d\n", |
__FUNCTION__, bo->handle, bytes(bo), err); |
return NULL; |
} |
4416,58 → 4645,66 |
VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); |
DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); |
bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); |
return (void *)(uintptr_t)mmap_arg.addr_ptr; |
return bo->map__cpu = (void *)(uintptr_t)mmap_arg.addr_ptr; |
} |
void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) |
/* |
struct kgem_bo *kgem_create_map(struct kgem *kgem, |
void *ptr, uint32_t size, |
bool read_only) |
{ |
struct drm_i915_gem_mmap mmap_arg; |
struct kgem_bo *bo; |
uintptr_t first_page, last_page; |
uint32_t handle; |
DBG(("%s(handle=%d, size=%d, mapped? %d)\n", |
__FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); |
assert(bo->refcnt); |
assert(!bo->purged); |
assert(list_is_empty(&bo->list)); |
assert(bo->proxy == NULL); |
assert(MAP(ptr) == ptr); |
if (IS_CPU_MAP(bo->map)) |
return MAP(bo->map); |
if (!kgem->has_userptr) |
return NULL; |
retry: |
VG_CLEAR(mmap_arg); |
mmap_arg.handle = bo->handle; |
mmap_arg.offset = 0; |
mmap_arg.size = bytes(bo); |
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { |
int err = errno; |
first_page = (uintptr_t)ptr; |
last_page = first_page + size + PAGE_SIZE - 1; |
assert(err != EINVAL); |
first_page &= ~(PAGE_SIZE-1); |
last_page &= ~(PAGE_SIZE-1); |
assert(last_page > first_page); |
if (__kgem_throttle_retire(kgem, 0)) |
goto retry; |
handle = gem_userptr(kgem->fd, |
(void *)first_page, last_page-first_page, |
read_only); |
if (handle == 0) |
return NULL; |
if (kgem->need_expire) { |
kgem_cleanup_cache(kgem); |
goto retry; |
} |
bo = __kgem_bo_alloc(handle, (last_page - first_page) / PAGE_SIZE); |
if (bo == NULL) { |
gem_close(kgem->fd, handle); |
return NULL; |
} |
ErrorF("%s: failed to mmap handle=%d, %d bytes, into CPU domain: %d\n", |
__FUNCTION__, bo->handle, bytes(bo), err); |
bo->snoop = !kgem->has_llc; |
debug_alloc__bo(kgem, bo); |
if (first_page != (uintptr_t)ptr) { |
struct kgem_bo *proxy; |
proxy = kgem_create_proxy(kgem, bo, |
(uintptr_t)ptr - first_page, size); |
kgem_bo_destroy(kgem, bo); |
if (proxy == NULL) |
return NULL; |
bo = proxy; |
} |
VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); |
if (bo->map && bo->domain == DOMAIN_CPU) { |
DBG(("%s: discarding GTT vma for %d\n", __FUNCTION__, bo->handle)); |
kgem_bo_release_map(kgem, bo); |
} |
if (bo->map == NULL) { |
DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); |
bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); |
} |
return (void *)(uintptr_t)mmap_arg.addr_ptr; |
bo->map__cpu = MAKE_USER_MAP(ptr); |
DBG(("%s(ptr=%p, size=%d, pages=%d, read_only=%d) => handle=%d (proxy? %d)\n", |
__FUNCTION__, ptr, size, NUM_PAGES(size), read_only, handle, bo->proxy != NULL)); |
return bo; |
} |
*/ |
void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo) |
{ |
DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); |
4500,6 → 4737,72 |
} |
} |
/* Synchronise @bo for CPU access, optionally for writing.
 *
 * Resolves proxy bos to their real backing bo, then moves the bo into
 * the CPU domain via DRM_IOCTL_I915_GEM_SET_DOMAIN when it is not
 * already there (or when FORCE_MMAP_SYNC forces the ioctl for the CPU
 * domain).  On a successful domain change the bo is retired if it has
 * no pending execbuffer, and its domain is recorded as DOMAIN_CPU for
 * writes or DOMAIN_NONE for read-only access.
 */
void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write)
{
DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
/* Writing through the CPU to a scanout bo is not supported here. */
assert(!bo->scanout || !write);
/* Flush any batch still referencing the bo before the CPU touches it. */
if (write || bo->needs_flush)
kgem_bo_submit(kgem, bo);
/* SHM pixmaps use proxies for subpage offsets */
assert(!bo->purged);
assert(bo->refcnt);
/* Walk down to the real backing bo; the ioctl needs its handle. */
while (bo->proxy)
bo = bo->proxy;
assert(bo->refcnt);
assert(!bo->purged);
if (bo->domain != DOMAIN_CPU || FORCE_MMAP_SYNC & (1 << DOMAIN_CPU)) {
struct drm_i915_gem_set_domain set_domain;
DBG(("%s: SYNC: handle=%d, needs_flush? %d, domain? %d, busy? %d\n",
__FUNCTION__, bo->handle,
bo->needs_flush, bo->domain,
__kgem_busy(kgem, bo->handle)));
VG_CLEAR(set_domain);
set_domain.handle = bo->handle;
set_domain.read_domains = I915_GEM_DOMAIN_CPU;
/* Only claim the CPU write domain when actually writing. */
set_domain.write_domain = write ? I915_GEM_DOMAIN_CPU : 0;
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
/* No pending execbuffer: the bo can be retired immediately. */
if (bo->exec == NULL)
kgem_bo_retire(kgem, bo);
/* A read-only sync leaves the domain undecided (DOMAIN_NONE). */
bo->domain = write ? DOMAIN_CPU : DOMAIN_NONE;
}
}
}
/* Synchronise @bo for access through its GTT mapping.
 *
 * Submits any batch referencing the bo, then moves it into the GTT
 * domain for both read and write via DRM_IOCTL_I915_GEM_SET_DOMAIN
 * when it is not already there (or when FORCE_MMAP_SYNC forces the
 * ioctl for the GTT domain).  On success the bo is retired and marked
 * gtt_dirty.  Proxies are not accepted; callers must pass the real bo.
 */
void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
assert(bo->refcnt);
assert(bo->proxy == NULL);
/* Flush any batch still referencing the bo before GTT access. */
kgem_bo_submit(kgem, bo);
if (bo->domain != DOMAIN_GTT || FORCE_MMAP_SYNC & (1 << DOMAIN_GTT)) {
struct drm_i915_gem_set_domain set_domain;
DBG(("%s: SYNC: handle=%d, needs_flush? %d, domain? %d, busy? %d\n",
__FUNCTION__, bo->handle,
bo->needs_flush, bo->domain,
__kgem_busy(kgem, bo->handle)));
VG_CLEAR(set_domain);
set_domain.handle = bo->handle;
/* GTT access is always treated as read/write here. */
set_domain.read_domains = I915_GEM_DOMAIN_GTT;
set_domain.write_domain = I915_GEM_DOMAIN_GTT;
if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
kgem_bo_retire(kgem, bo);
bo->domain = DOMAIN_GTT;
/* Record that the GTT view may now hold unflushed writes. */
bo->gtt_dirty = true;
}
}
}
void kgem_clear_dirty(struct kgem *kgem) |
{ |
struct list * const buffers = &kgem->next_request->buffers; |
4542,7 → 4845,7 |
bo->proxy = kgem_bo_reference(target); |
bo->delta = offset; |
if (target->exec) { |
if (target->exec && !bo->io) { |
list_move_tail(&bo->request, &kgem->next_request->buffers); |
bo->exec = &_kgem_dummy_exec; |
} |
4563,7 → 4866,7 |
bo->mem = NULL; |
bo->need_io = false; |
bo->mmapped = true; |
bo->mmapped = MMAPPED_CPU; |
return bo; |
} |
4638,7 → 4941,7 |
assert(bo->base.snoop); |
assert(bo->base.tiling == I915_TILING_NONE); |
assert(num_pages(&bo->base) >= alloc); |
assert(bo->mmapped == true); |
assert(bo->mmapped == MMAPPED_CPU); |
assert(bo->need_io == false); |
bo->mem = kgem_bo_map__cpu(kgem, &bo->base); |
4685,7 → 4988,7 |
} |
assert(bo->base.refcnt == 1); |
assert(bo->mmapped == true); |
assert(bo->mmapped == MMAPPED_CPU); |
assert(bo->need_io == false); |
bo->mem = kgem_bo_map__cpu(kgem, &bo->base); |
4721,7 → 5024,7 |
} |
assert(bo->base.refcnt == 1); |
assert(bo->mmapped == true); |
assert(bo->mmapped == MMAPPED_CPU); |
assert(bo->need_io == false); |
if (!gem_set_caching(kgem->fd, bo->base.handle, SNOOPED)) |
4763,12 → 5066,12 |
DBG(("%s: created snoop handle=%d for buffer\n", |
__FUNCTION__, bo->base.handle)); |
assert(bo->mmapped == true); |
assert(bo->mmapped == MMAPPED_CPU); |
assert(bo->need_io == false); |
bo->base.refcnt = 1; |
bo->base.snoop = true; |
bo->base.map = MAKE_USER_MAP(bo->mem); |
bo->base.map__cpu = MAKE_USER_MAP(bo->mem); |
return bo; |
} |
4801,11 → 5104,12 |
/* We can reuse any write buffer which we can fit */ |
if (flags == KGEM_BUFFER_LAST && |
bo->write == KGEM_BUFFER_WRITE && |
bo->base.refcnt == 1 && !bo->mmapped && |
bo->base.refcnt == 1 && |
bo->mmapped == MMAPPED_NONE && |
size <= bytes(&bo->base)) { |
DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n", |
__FUNCTION__, size, bo->used, bytes(&bo->base))); |
gem_write(kgem->fd, bo->base.handle, |
gem_write__cachealigned(kgem->fd, bo->base.handle, |
0, bo->used, bo->mem); |
kgem_buffer_release(kgem, bo); |
bo->need_io = 0; |
4845,10 → 5149,11 |
list_for_each_entry(bo, &kgem->active_buffers, base.list) { |
assert(bo->base.io); |
assert(bo->base.refcnt >= 1); |
assert(bo->base.exec == NULL); |
assert(bo->mmapped); |
assert(!IS_CPU_MAP(bo->base.map) || kgem->has_llc || bo->base.snoop); |
assert(bo->mmapped == MMAPPED_GTT || kgem->has_llc || bo->base.snoop); |
if (!kgem->has_llc && (bo->write & ~flags) & KGEM_BUFFER_INPLACE) { |
if ((bo->write & ~flags) & KGEM_BUFFER_INPLACE && !bo->base.snoop) { |
DBG(("%s: skip write %x buffer, need %x\n", |
__FUNCTION__, bo->write, flags)); |
continue; |
4862,6 → 5167,29 |
list_move(&bo->base.list, &kgem->batch_buffers); |
goto done; |
} |
if (size <= bytes(&bo->base) && |
(bo->base.rq == NULL || |
!__kgem_busy(kgem, bo->base.handle))) { |
DBG(("%s: reusing whole buffer? size=%d, total=%d\n", |
__FUNCTION__, size, bytes(&bo->base))); |
__kgem_bo_clear_busy(&bo->base); |
kgem_buffer_release(kgem, bo); |
switch (bo->mmapped) { |
case MMAPPED_CPU: |
kgem_bo_sync__cpu(kgem, &bo->base); |
break; |
case MMAPPED_GTT: |
kgem_bo_sync__gtt(kgem, &bo->base); |
break; |
} |
offset = 0; |
bo->used = size; |
list_move(&bo->base.list, &kgem->batch_buffers); |
goto done; |
} |
} |
} |
#endif |
4875,9 → 5203,9 |
alloc = PAGE_ALIGN(size); |
assert(alloc); |
alloc /= PAGE_SIZE; |
if (alloc > kgem->aperture_mappable / 4) |
flags &= ~KGEM_BUFFER_INPLACE; |
alloc /= PAGE_SIZE; |
if (kgem->has_llc && |
(flags & KGEM_BUFFER_WRITE_INPLACE) != KGEM_BUFFER_WRITE_INPLACE) { |
4963,7 → 5291,7 |
CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP); |
if (old == NULL) { |
old = search_linear_cache(kgem, alloc, CREATE_INACTIVE); |
if (old && !__kgem_bo_is_mappable(kgem, old)) { |
if (old && !kgem_bo_can_map(kgem, old)) { |
_kgem_bo_destroy(kgem, old); |
old = NULL; |
} |
4971,7 → 5299,7 |
if (old) { |
DBG(("%s: reusing handle=%d for buffer\n", |
__FUNCTION__, old->handle)); |
assert(__kgem_bo_is_mappable(kgem, old)); |
assert(kgem_bo_can_map(kgem, old)); |
assert(!old->snoop); |
assert(old->rq == NULL); |
4987,8 → 5315,10 |
bo->mem = kgem_bo_map(kgem, &bo->base); |
if (bo->mem) { |
if (IS_CPU_MAP(bo->base.map)) |
if (bo->mem == MAP(bo->base.map__cpu)) |
flags &= ~KGEM_BUFFER_INPLACE; |
else |
bo->mmapped = MMAPPED_GTT; |
goto init; |
} else { |
bo->base.refcnt = 0; |
5107,7 → 5437,8 |
assert(!bo->need_io || !bo->base.needs_flush); |
assert(!bo->need_io || bo->base.domain != DOMAIN_GPU); |
assert(bo->mem); |
assert(!bo->mmapped || bo->base.map != NULL); |
assert(bo->mmapped != MMAPPED_GTT || MAP(bo->base.map__gtt) == bo->mem); |
assert(bo->mmapped != MMAPPED_CPU || MAP(bo->base.map__cpu) == bo->mem); |
bo->used = size; |
bo->write = flags & KGEM_BUFFER_WRITE_INPLACE; |
5121,6 → 5452,7 |
done: |
bo->used = ALIGN(bo->used, UPLOAD_ALIGNMENT); |
assert(bo->used && bo->used <= bytes(&bo->base)); |
assert(bo->mem); |
*ret = (char *)bo->mem + offset; |
return kgem_create_proxy(kgem, &bo->base, offset, size); |
5177,7 → 5509,7 |
bo->size.bytes -= stride; |
} |
bo->map = MAKE_CPU_MAP(*ret); |
bo->map__cpu = *ret; |
bo->pitch = stride; |
bo->unique_id = kgem_get_unique_id(kgem); |
return bo; |
5222,10 → 5554,10 |
struct kgem_bo **ptr) |
{ |
DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); |
assert(bo->map == NULL || IS_CPU_MAP(bo->map)); |
assert(bo->map__gtt == NULL); |
assert(bo->proxy); |
list_add(&bo->vma, &bo->proxy->vma); |
bo->map = ptr; |
bo->map__gtt = ptr; |
*ptr = kgem_bo_reference(bo); |
} |
5258,13 → 5590,13 |
bo->base.domain, |
__kgem_busy(kgem, bo->base.handle))); |
assert(!IS_CPU_MAP(bo->base.map) || bo->base.snoop || kgem->has_llc); |
assert(bo->mmapped == MMAPPED_GTT || bo->base.snoop || kgem->has_llc); |
VG_CLEAR(set_domain); |
set_domain.handle = bo->base.handle; |
set_domain.write_domain = 0; |
set_domain.read_domains = |
IS_CPU_MAP(bo->base.map) ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT; |
bo->mmapped == MMAPPED_CPU ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT; |
if (drmIoctl(kgem->fd, |
DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) |
/contrib/sdk/sources/Intel-2D/sna/kgem.h |
---|
71,9 → 71,8 |
struct list request; |
struct list vma; |
void *map; |
#define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1) |
#define IS_GTT_MAP(ptr) (ptr && ((uintptr_t)(ptr) & 1) == 0) |
void *map__cpu; |
void *map__gtt; |
#define MAP(ptr) ((void*)((uintptr_t)(ptr) & ~3)) |
struct kgem_bo_binding { |
82,11 → 81,11 |
uint16_t offset; |
} binding; |
uint64_t presumed_offset; |
uint32_t unique_id; |
uint32_t refcnt; |
uint32_t handle; |
uint32_t target_handle; |
uint32_t presumed_offset; |
uint32_t delta; |
union { |
struct { |
200,11 → 199,12 |
uint32_t has_handle_lut :1; |
uint32_t can_blt_cpu :1; |
uint32_t can_render_y :1; |
uint16_t fence_max; |
uint16_t half_cpu_cache_pages; |
uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable; |
uint32_t aperture, aperture_fenced; |
uint32_t aperture, aperture_fenced, aperture_max_fence; |
uint32_t max_upload_tile_size, max_copy_tile_size; |
uint32_t max_gpu_size, max_cpu_size; |
uint32_t large_object_size, max_object_size; |
313,6 → 313,8 |
int bpp, |
uint32_t flags); |
bool kgem_bo_convert_to_gpu(struct kgem *kgem, struct kgem_bo *bo); |
uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format); |
void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset); |
349,14 → 351,6 |
_kgem_submit(kgem); |
} |
/* (Helper removed by this patch.)  Returns true when there are pending
 * relocations and the bo's flush requirement differs from the batch's
 * flush state while the current ring is idle — i.e. when it is cheap
 * and useful to submit now.  NOTE(review): exact intent of the
 * kgem->flush ^ flush test inferred from usage elsewhere — confirm. */
static inline bool kgem_flush(struct kgem *kgem, bool flush)
{
if (kgem->nreloc == 0)
return false;
return (kgem->flush ^ flush) && kgem_ring_is_idle(kgem, kgem->ring);
}
static inline void kgem_bo_submit(struct kgem *kgem, struct kgem_bo *bo) |
{ |
if (bo->exec) |
392,8 → 386,10 |
kgem_submit(kgem); |
#endif |
if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring)) |
if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring)) { |
DBG(("%s: flushing before new bo\n", __FUNCTION__)); |
_kgem_submit(kgem); |
} |
if (kgem->mode == mode) |
return; |
466,6 → 462,11 |
struct kgem_bo *bo, |
uint32_t read_write_domains, |
uint32_t delta); |
uint64_t kgem_add_reloc64(struct kgem *kgem, |
uint32_t pos, |
struct kgem_bo *bo, |
uint32_t read_write_domains, |
uint64_t delta); |
void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo); |
void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo); |
475,8 → 476,6 |
void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); |
void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo); |
void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write); |
void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); |
void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr); |
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo); |
bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, |
483,7 → 482,7 |
const void *data, int length); |
int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo); |
void kgem_get_tile_size(struct kgem *kgem, int tiling, |
void kgem_get_tile_size(struct kgem *kgem, int tiling, int pitch, |
int *tile_width, int *tile_height, int *tile_size); |
static inline int __kgem_buffer_size(struct kgem_bo *bo) |
498,6 → 497,12 |
return PAGE_SIZE * bo->size.pages.count; |
} |
/* Size of a real (non-proxy) bo in pages, straight from its size union. */
static inline int __kgem_bo_num_pages(struct kgem_bo *bo)
{
/* Proxies store a byte delta/length instead; they are not valid here. */
assert(bo->proxy == NULL);
return bo->size.pages.count;
}
static inline int kgem_bo_size(struct kgem_bo *bo) |
{ |
if (bo->proxy) |
506,7 → 511,6 |
return __kgem_bo_size(bo); |
} |
/* |
static inline bool kgem_bo_blt_pitch_is_ok(struct kgem *kgem, |
struct kgem_bo *bo) |
{ |
533,81 → 537,7 |
return kgem_bo_blt_pitch_is_ok(kgem, bo); |
} |
*/ |
/* (Removed by this patch.)  Predicate: can @bo be mapped through the
 * mappable GTT aperture right now?  This older variant works in bytes
 * (kgem_bo_size vs aperture_mappable). */
static inline bool __kgem_bo_is_mappable(struct kgem *kgem,
struct kgem_bo *bo)
{
/* Already bound in the GTT domain: trivially mappable. */
if (bo->domain == DOMAIN_GTT)
return true;
/* Pre-gen4 tiled bos must sit at a fence-size-aligned offset. */
if (kgem->gen < 040 && bo->tiling &&
bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1))
return false;
/* Y-tiling is never GTT-mappable on gen 021. */
if (kgem->gen == 021 && bo->tiling == I915_TILING_Y)
return false;
/* With LLC a linear bo never needs the mappable aperture. */
if (kgem->has_llc && bo->tiling == I915_TILING_NONE)
return true;
/* Unknown offset: be conservative, allow only a quarter aperture. */
if (!bo->presumed_offset)
return kgem_bo_size(bo) <= kgem->aperture_mappable / 4;
return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
}
/* (Removed by this patch.)  Debug/assert wrapper around
 * __kgem_bo_is_mappable for bos known to be referenced. */
static inline bool kgem_bo_is_mappable(struct kgem *kgem,
struct kgem_bo *bo)
{
DBG(("%s: domain=%d, offset: %d size: %d\n",
__FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo)));
assert(bo->refcnt);
return __kgem_bo_is_mappable(kgem, bo);
}
/* (Removed by this patch.)  Old single-pointer variant: with no cached
 * map, only a linear bo in the CPU domain counts as mapped; otherwise
 * the cached map's type (CPU vs GTT, tagged in the low bit) must match
 * the tiling — CPU map for linear, GTT map for tiled. */
static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: map=%p, tiling=%d, domain=%d\n",
__FUNCTION__, bo->map, bo->tiling, bo->domain));
assert(bo->refcnt);
if (bo->map == NULL)
return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
return IS_CPU_MAP(bo->map) == !bo->tiling;
}
/* (Removed by this patch.)  Old byte-based variant: a bo can be mapped
 * if it already is, if it is linear and CPU-coherent, or — excluding
 * Y-tiling on gen 021 — if it fits in a quarter of the mappable
 * aperture. */
static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
{
if (kgem_bo_mapped(kgem, bo))
return true;
if (!bo->tiling && (kgem->has_llc || bo->domain == DOMAIN_CPU))
return true;
if (kgem->gen == 021 && bo->tiling == I915_TILING_Y)
return false;
return kgem_bo_size(bo) <= kgem->aperture_mappable / 4;
}
/* (Removed hunk; identical to the replacement.)  Direct CPU mmap access
 * is allowed unless the bo is purged or a write targets a scanout; LLC
 * makes everything coherent, otherwise the bo must already be in the
 * CPU domain and a write must not race a pending execbuffer. */
static inline bool kgem_bo_can_map__cpu(struct kgem *kgem,
struct kgem_bo *bo,
bool write)
{
if (bo->purged || (bo->scanout && write))
return false;
if (kgem->has_llc)
return true;
if (bo->domain != DOMAIN_CPU)
return false;
return !write || bo->exec == NULL;
}
static inline bool kgem_bo_is_snoop(struct kgem_bo *bo) |
{ |
assert(bo->refcnt); |
652,9 → 582,6 |
if (bo->exec) |
return true; |
if (kgem_flush(kgem, bo->flush)) |
kgem_submit(kgem); |
if (bo->rq && !__kgem_busy(kgem, bo->handle)) |
__kgem_bo_clear_busy(bo); |
723,6 → 650,53 |
} while ((bo = bo->proxy)); |
} |
static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo) |
{ |
DBG(("%s: map=%p:%p, tiling=%d, domain=%d\n", |
__FUNCTION__, bo->map__gtt, bo->map__cpu, bo->tiling, bo->domain)); |
if (bo->tiling == I915_TILING_NONE && (bo->domain == DOMAIN_CPU || kgem->has_llc)) |
return bo->map__cpu != NULL; |
return bo->map__gtt != NULL; |
} |
/* Predicate: can @bo be accessed through a mapping, creating one via
 * the mappable GTT aperture if necessary?
 *
 * NOTE(review): unlike the removed byte-based helper, the aperture
 * checks here compare page counts — consistent with the asserts
 * elsewhere comparing num_pages() against aperture_mappable — so
 * aperture_mappable is presumably stored in pages now; confirm. */
static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: map=%p:%p, tiling=%d, domain=%d, offset=%ld\n",
__FUNCTION__, bo->map__gtt, bo->map__cpu, bo->tiling, bo->domain, (long)bo->presumed_offset));
/* Linear and CPU-coherent: the CPU mmap suffices, no aperture needed. */
if (!bo->tiling && (kgem->has_llc || bo->domain == DOMAIN_CPU))
return true;
/* An existing GTT mapping can always be reused. */
if (bo->map__gtt != NULL)
return true;
/* Y-tiling is never GTT-mappable on gen 021. */
if (kgem->gen == 021 && bo->tiling == I915_TILING_Y)
return false;
/* Unknown placement: be conservative, allow only a quarter aperture. */
if (!bo->presumed_offset)
return __kgem_bo_num_pages(bo) <= kgem->aperture_mappable / 4;
return bo->presumed_offset / PAGE_SIZE + __kgem_bo_num_pages(bo) <= kgem->aperture_mappable;
}
static inline bool kgem_bo_can_map__cpu(struct kgem *kgem, |
struct kgem_bo *bo, |
bool write) |
{ |
if (bo->purged || (bo->scanout && write)) |
return false; |
if (kgem->has_llc) |
return true; |
if (bo->domain != DOMAIN_CPU) |
return false; |
return !write || bo->exec == NULL; |
} |
#define KGEM_BUFFER_WRITE 0x1 |
#define KGEM_BUFFER_INPLACE 0x2 |
#define KGEM_BUFFER_LAST 0x4 |
742,8 → 716,7 |
void kgem_throttle(struct kgem *kgem); |
#define MAX_INACTIVE_TIME 10 |
bool kgem_expire_cache(struct kgem *kgem); |
void kgem_purge_cache(struct kgem *kgem); |
void kgem_cleanup_cache(struct kgem *kgem); |
bool kgem_cleanup_cache(struct kgem *kgem); |
void kgem_clean_scanout_cache(struct kgem *kgem); |
void kgem_clean_large_cache(struct kgem *kgem); |
758,4 → 731,6 |
} |
#endif |
void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling); |
#endif /* KGEM_H */ |
/contrib/sdk/sources/Intel-2D/sna/sna.c |
---|
706,7 → 706,20 |
int intel_get_device_id(struct sna *sna) |
{ |
struct drm_i915_getparam gp; |
int devid = 0; |
memset(&gp, 0, sizeof(gp)); |
gp.param = I915_PARAM_CHIPSET_ID; |
gp.value = &devid; |
if (drmIoctl(sna->scrn, DRM_IOCTL_I915_GETPARAM, &gp)) |
return 0; |
return devid; |
} |
static const struct intel_device_info intel_generic_info = { |
.gen = -1, |
}; |
814,21 → 827,6 |
return &intel_generic_info; |
} |
/* (Removed by this patch; replaced by the struct sna * variant.)
 * Query the PCI chipset id via I915_PARAM_CHIPSET_ID on @fd.
 * Returns 0 if the ioctl fails. */
int intel_get_device_id(int fd)
{
struct drm_i915_getparam gp;
int devid = 0;
memset(&gp, 0, sizeof(gp));
gp.param = I915_PARAM_CHIPSET_ID;
/* The kernel writes the id through this pointer. */
gp.value = &devid;
if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
return 0;
return devid;
}
int drmIoctl(int fd, unsigned long request, void *arg) |
{ |
ioctl_t io; |
/contrib/sdk/sources/Intel-2D/sna/sna.h |
---|
448,7 → 448,8 |
unsigned flags; |
#define SNA_NO_WAIT 0x1 |
#define SNA_NO_FLIP 0x2 |
#define SNA_TRIPLE_BUFFER 0x4 |
#define SNA_NO_VSYNC 0x4 |
#define SNA_TRIPLE_BUFFER 0x8 |
#define SNA_TEAR_FREE 0x10 |
#define SNA_FORCE_SHADOW 0x20 |
#define SNA_FLUSH_GTT 0x40 |
490,6 → 491,7 |
uint32_t fill_alu; |
} blt_state; |
union { |
unsigned gt; |
struct gen3_render_state gen3; |
struct gen4_render_state gen4; |
struct gen5_render_state gen5; |
497,6 → 499,8 |
struct gen7_render_state gen7; |
} render_state; |
bool dri_available; |
bool dri_open; |
/* Broken-out options. */ |
// OptionInfoPtr Options; |
611,4 → 615,7 |
return memcmp(a, b, sizeof(*a)) == 0; |
} |
int intel_get_device_id(struct sna *sna); |
#endif /* _SNA_H */ |
/contrib/sdk/sources/Intel-2D/sna/sna_reg.h |
---|
42,22 → 42,22 |
#define BLT_SRC_TILED (1<<15) |
#define BLT_DST_TILED (1<<11) |
#define COLOR_BLT_CMD ((2<<29)|(0x40<<22)|(0x3)) |
#define XY_COLOR_BLT ((2<<29)|(0x50<<22)|(0x4)) |
#define XY_SETUP_BLT ((2<<29)|(1<<22)|6) |
#define XY_SETUP_MONO_PATTERN_SL_BLT ((2<<29)|(0x11<<22)|7) |
#define XY_SETUP_CLIP ((2<<29)|(3<<22)|1) |
#define XY_SCANLINE_BLT ((2<<29)|(0x25<<22)|1) |
#define XY_TEXT_IMMEDIATE_BLT ((2<<29)|(0x31<<22)|(1<<16)) |
#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) |
#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|0x4) |
#define XY_PAT_BLT ((2<<29)|(0x51<<22)|0x4) |
#define XY_PAT_BLT_IMMEDIATE ((2<<29)|(0x72<<22)) |
#define XY_MONO_PAT ((0x2<<29)|(0x52<<22)|0x7) |
#define XY_MONO_SRC_COPY ((0x2<<29)|(0x54<<22)|(0x6)) |
#define XY_MONO_SRC_COPY_IMM ((0x2<<29)|(0x71<<22)) |
#define XY_FULL_MONO_PATTERN_BLT ((0x2<<29)|(0x57<<22)|0xa) |
#define XY_FULL_MONO_PATTERN_MONO_SRC_BLT ((0x2<<29)|(0x58<<22)|0xa) |
#define COLOR_BLT_CMD (2<<29|0x40<<22|(0x3)) |
#define XY_COLOR_BLT (2<<29|0x50<<22|(0x4)) |
#define XY_SETUP_BLT (2<<29|0x01<<22) |
#define XY_SETUP_MONO_PATTERN_SL_BLT (2<<29|0x11<<22) |
#define XY_SETUP_CLIP (2<<29|0x03<<22|1) |
#define XY_SCANLINE_BLT (2<<29|0x25<<22|1) |
#define XY_TEXT_IMMEDIATE_BLT (2<<29|0x31<<22|(1<<16)) |
#define XY_SRC_COPY_BLT_CMD (2<<29|0x53<<22) |
#define SRC_COPY_BLT_CMD (2<<29|0x43<<22|0x4) |
#define XY_PAT_BLT (2<<29|0x51<<22) |
#define XY_PAT_BLT_IMMEDIATE (2<<29|0x72<<22) |
#define XY_MONO_PAT (2<<29|0x52<<22) |
#define XY_MONO_SRC_COPY (2<<29|0x54<<22) |
#define XY_MONO_SRC_COPY_IMM (2<<29|0x71<<22) |
#define XY_FULL_MONO_PATTERN_BLT (2<<29|0x57<<22) |
#define XY_FULL_MONO_PATTERN_MONO_SRC_BLT (2<<29|0x58<<22) |
/* FLUSH commands */ |
#define BRW_3D(Pipeline,Opcode,Subopcode) \ |
/contrib/sdk/sources/Intel-2D/sna/sna_render.h |
---|
104,6 → 104,7 |
uint32_t inplace :1; |
uint32_t overwrites:1; |
uint32_t bpp : 6; |
uint32_t alu : 4; |
uint32_t cmd; |
uint32_t br13; |
245,7 → 246,7 |
struct sna_solid_cache { |
struct kgem_bo *cache_bo; |
struct kgem_bo *bo[1024]; |
uint32_t color[1025]; |
uint32_t color[1024]; |
int last; |
int size; |
int dirty; |
381,6 → 382,7 |
}; |
struct gen6_render_state { |
unsigned gt; |
const struct gt_info *info; |
struct kgem_bo *general_bo; |
430,6 → 432,7 |
}; |
struct gen7_render_state { |
unsigned gt; |
const struct gt_info *info; |
struct kgem_bo *general_bo; |
/contrib/sdk/sources/Intel-2D/uxa/uxa.c |
---|
70,10 → 70,10 |
// intel_debug_flush(scrn); |
} |
int sna_bitmap_from_handle(bitmap_t *bitmap, uint32_t handle) |
int uxa_bitmap_from_handle(bitmap_t *bitmap, uint32_t handle) |
{ |
struct intel_screen_private *intel = intel_get_screen_private(); |
drm_intel_bo *bo; |
struct intel_screen_private *intel = intel_get_screen_private(); |
drm_intel_bo *bo; |
surface_t *sf; |
unsigned int size; |
118,14 → 118,14 |
return -1; |
}; |
void sna_set_bo_handle(bitmap_t *bitmap, int handle) |
void uxa_set_bo_handle(bitmap_t *bitmap, int handle) |
{ |
sna_bitmap_from_handle(bitmap, handle); |
uxa_bitmap_from_handle(bitmap, handle); |
}; |
int sna_blit_tex(bitmap_t *bitmap, bool scale, int dst_x, int dst_y, |
int w, int h, int src_x, int src_y) |
int uxa_blit_tex(bitmap_t *bitmap, int scale, int vsync, |
int dst_x, int dst_y,int w, int h, int src_x, int src_y) |
{ |
// DBG("%s\n", __FUNCTION__); |