diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
index d3c2639558d1..784585afac25 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
@@ -33,8 +33,10 @@ struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce,
 	size = (1 + 8 * count) * sizeof(u32);
 	size = round_up(size, PAGE_SIZE);
 	pool = intel_engine_pool_get(&ce->engine->pool, size);
-	if (IS_ERR(pool))
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
 		goto out_pm;
+	}
 
 	cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC);
 	if (IS_ERR(cmd)) {
@@ -189,6 +191,207 @@ out_unpin:
 	return err;
 }
 
+struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce,
+					 struct i915_vma *src,
+					 struct i915_vma *dst)
+{
+	struct drm_i915_private *i915 = ce->vm->i915;
+	const u32 block_size = S16_MAX * PAGE_SIZE;
+	struct intel_engine_pool_node *pool;
+	struct i915_vma *batch;
+	u64 src_offset, dst_offset;
+	u64 count, rem;
+	u32 size, *cmd;
+	int err;
+
+	GEM_BUG_ON(src->size != dst->size);
+
+	GEM_BUG_ON(intel_engine_is_virtual(ce->engine));
+	intel_engine_pm_get(ce->engine);
+
+	count = div_u64(dst->size, block_size);
+	size = (1 + 11 * count) * sizeof(u32);
+	size = round_up(size, PAGE_SIZE);
+	pool = intel_engine_pool_get(&ce->engine->pool, size);
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+		goto out_pm;
+	}
+
+	cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC);
+	if (IS_ERR(cmd)) {
+		err = PTR_ERR(cmd);
+		goto out_put;
+	}
+
+	rem = src->size;
+	src_offset = src->node.start;
+	dst_offset = dst->node.start;
+
+	do {
+		size = min_t(u64, rem, block_size);
+		GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
+
+		if (INTEL_GEN(i915) >= 9) {
+			*cmd++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
+			*cmd++ = BLT_DEPTH_32 | PAGE_SIZE;
+			*cmd++ = 0;
+			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+			*cmd++ = lower_32_bits(dst_offset);
+			*cmd++ = upper_32_bits(dst_offset);
+			*cmd++ = 0;
+			*cmd++ = PAGE_SIZE;
+			*cmd++ = lower_32_bits(src_offset);
+			*cmd++ = upper_32_bits(src_offset);
+		} else if (INTEL_GEN(i915) >= 8) {
+			*cmd++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
+			*cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+			*cmd++ = 0;
+			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+			*cmd++ = lower_32_bits(dst_offset);
+			*cmd++ = upper_32_bits(dst_offset);
+			*cmd++ = 0;
+			*cmd++ = PAGE_SIZE;
+			*cmd++ = lower_32_bits(src_offset);
+			*cmd++ = upper_32_bits(src_offset);
+		} else {
+			*cmd++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
+			*cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
+			*cmd++ = dst_offset;
+			*cmd++ = PAGE_SIZE;
+			*cmd++ = src_offset;
+		}
+
+		/* Allow ourselves to be preempted in between blocks. */
+		*cmd++ = MI_ARB_CHECK;
+
+		src_offset += size;
+		dst_offset += size;
+		rem -= size;
+	} while (rem);
+
+	*cmd = MI_BATCH_BUFFER_END;
+	intel_gt_chipset_flush(ce->vm->gt);
+
+	i915_gem_object_unpin_map(pool->obj);
+
+	batch = i915_vma_instance(pool->obj, ce->vm, NULL);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto out_put;
+	}
+
+	err = i915_vma_pin(batch, 0, 0, PIN_USER);
+	if (unlikely(err))
+		goto out_put;
+
+	batch->private = pool;
+	return batch;
+
+out_put:
+	intel_engine_pool_put(pool);
+out_pm:
+	intel_engine_pm_put(ce->engine);
+	return ERR_PTR(err);
+}
+
+static int move_to_gpu(struct i915_vma *vma, struct i915_request *rq, bool write)
+{
+	struct drm_i915_gem_object *obj = vma->obj;
+
+	if (obj->cache_dirty & ~obj->cache_coherent)
+		i915_gem_clflush_object(obj, 0);
+
+	return i915_request_await_object(rq, obj, write);
+}
+
+int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
+			     struct drm_i915_gem_object *dst,
+			     struct intel_context *ce)
+{
+	struct drm_gem_object *objs[] = { &src->base, &dst->base };
+	struct i915_address_space *vm = ce->vm;
+	struct i915_vma *vma[2], *batch;
+	struct ww_acquire_ctx acquire;
+	struct i915_request *rq;
+	int err, i;
+
+	vma[0] = i915_vma_instance(src, vm, NULL);
+	if (IS_ERR(vma[0]))
+		return PTR_ERR(vma[0]);
+
+	err = i915_vma_pin(vma[0], 0, 0, PIN_USER);
+	if (unlikely(err))
+		return err;
+
+	vma[1] = i915_vma_instance(dst, vm, NULL);
+	if (IS_ERR(vma[1]))
+		goto out_unpin_src;
+
+	err = i915_vma_pin(vma[1], 0, 0, PIN_USER);
+	if (unlikely(err))
+		goto out_unpin_src;
+
+	batch = intel_emit_vma_copy_blt(ce, vma[0], vma[1]);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto out_unpin_dst;
+	}
+
+	rq = intel_context_create_request(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_batch;
+	}
+
+	err = intel_emit_vma_mark_active(batch, rq);
+	if (unlikely(err))
+		goto out_request;
+
+	err = drm_gem_lock_reservations(objs, ARRAY_SIZE(objs), &acquire);
+	if (unlikely(err))
+		goto out_request;
+
+	for (i = 0; i < ARRAY_SIZE(vma); i++) {
+		err = move_to_gpu(vma[i], rq, i);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(vma); i++) {
+		unsigned int flags = i ? EXEC_OBJECT_WRITE : 0;
+
+		err = i915_vma_move_to_active(vma[i], rq, flags);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	if (rq->engine->emit_init_breadcrumb) {
+		err = rq->engine->emit_init_breadcrumb(rq);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	err = rq->engine->emit_bb_start(rq,
+					batch->node.start, batch->node.size,
+					0);
+out_unlock:
+	drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire);
+out_request:
+	if (unlikely(err))
+		i915_request_skip(rq, err);
+
+	i915_request_add(rq);
+out_batch:
+	intel_emit_vma_release(ce, batch);
+out_unpin_dst:
+	i915_vma_unpin(vma[1]);
+out_unpin_src:
+	i915_vma_unpin(vma[0]);
+	return err;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/i915_gem_object_blt.c"
 #endif
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
index 9448b9192137..243a43a87824 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
@@ -19,6 +19,10 @@ struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce,
 					 struct i915_vma *vma,
 					 u32 value);
 
+struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce,
+					 struct i915_vma *src,
+					 struct i915_vma *dst);
+
 int intel_emit_vma_mark_active(struct i915_vma *vma, struct i915_request *rq);
 void intel_emit_vma_release(struct intel_context *ce, struct i915_vma *vma);
 
@@ -26,4 +30,8 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
 			     struct intel_context *ce,
 			     u32 value);
 
+int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
+			     struct drm_i915_gem_object *dst,
+			     struct intel_context *ce);
+
 #endif
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
index c6e1eebe53f5..c21d747e7d05 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
@@ -103,10 +103,116 @@ err_flush:
 	return err;
 }
 
+static int igt_copy_blt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_context *ce = i915->engine[BCS0]->kernel_context;
+	struct drm_i915_gem_object *src, *dst;
+	struct rnd_state prng;
+	IGT_TIMEOUT(end);
+	u32 *vaddr;
+	int err = 0;
+
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+
+	do {
+		const u32 max_block_size = S16_MAX * PAGE_SIZE;
+		u32 sz = min_t(u64, ce->vm->total >> 4, prandom_u32_state(&prng));
+		u32 phys_sz = sz % (max_block_size + 1);
+		u32 val = prandom_u32_state(&prng);
+		u32 i;
+
+		sz = round_up(sz, PAGE_SIZE);
+		phys_sz = round_up(phys_sz, PAGE_SIZE);
+
+		pr_debug("%s with phys_sz= %x, sz=%x, val=%x\n", __func__,
+			 phys_sz, sz, val);
+
+		src = huge_gem_object(i915, phys_sz, sz);
+		if (IS_ERR(src)) {
+			err = PTR_ERR(src);
+			goto err_flush;
+		}
+
+		vaddr = i915_gem_object_pin_map(src, I915_MAP_WB);
+		if (IS_ERR(vaddr)) {
+			err = PTR_ERR(vaddr);
+			goto err_put_src;
+		}
+
+		memset32(vaddr, val,
+			 huge_gem_object_phys_size(src) / sizeof(u32));
+
+		i915_gem_object_unpin_map(src);
+
+		if (!(src->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
+			src->cache_dirty = true;
+
+		dst = huge_gem_object(i915, phys_sz, sz);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto err_put_src;
+		}
+
+		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WB);
+		if (IS_ERR(vaddr)) {
+			err = PTR_ERR(vaddr);
+			goto err_put_dst;
+		}
+
+		memset32(vaddr, val ^ 0xdeadbeaf,
+			 huge_gem_object_phys_size(dst) / sizeof(u32));
+
+		if (!(dst->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
+			dst->cache_dirty = true;
+
+		mutex_lock(&i915->drm.struct_mutex);
+		err = i915_gem_object_copy_blt(src, dst, ce);
+		mutex_unlock(&i915->drm.struct_mutex);
+		if (err)
+			goto err_unpin;
+
+		i915_gem_object_lock(dst);
+		err = i915_gem_object_set_to_cpu_domain(dst, false);
+		i915_gem_object_unlock(dst);
+		if (err)
+			goto err_unpin;
+
+		for (i = 0; i < huge_gem_object_phys_size(dst) / sizeof(u32); ++i) {
+			if (vaddr[i] != val) {
+				pr_err("vaddr[%u]=%x, expected=%x\n", i,
+				       vaddr[i], val);
+				err = -EINVAL;
+				goto err_unpin;
+			}
+		}
+
+		i915_gem_object_unpin_map(dst);
+
+		i915_gem_object_put(src);
+		i915_gem_object_put(dst);
+	} while (!time_after(jiffies, end));
+
+	goto err_flush;
+
+err_unpin:
+	i915_gem_object_unpin_map(dst);
+err_put_dst:
+	i915_gem_object_put(dst);
+err_put_src:
+	i915_gem_object_put(src);
+err_flush:
+	if (err == -ENOMEM)
+		err = 0;
+
+	return err;
+}
+
 int i915_gem_object_blt_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
 		SUBTEST(igt_fill_blt),
+		SUBTEST(igt_copy_blt),
 	};
 
 	if (intel_gt_is_wedged(&i915->gt))
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 69f34737325f..6a0879c27d14 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -186,11 +186,12 @@
 #define GFX_OP_DRAWRECT_INFO     ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
-#define COLOR_BLT_CMD			(2<<29 | 0x40<<22 | (5-2))
+#define COLOR_BLT_CMD			(2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD		(2 << 29 | 0x50 << 22)
-#define SRC_COPY_BLT_CMD		((2<<29)|(0x43<<22)|4)
-#define XY_SRC_COPY_BLT_CMD		((2<<29)|(0x53<<22)|6)
-#define XY_MONO_SRC_COPY_IMM_BLT	((2<<29)|(0x71<<22)|5)
+#define SRC_COPY_BLT_CMD		(2 << 29 | 0x43 << 22)
+#define GEN9_XY_FAST_COPY_BLT_CMD	(2 << 29 | 0x42 << 22)
+#define XY_SRC_COPY_BLT_CMD		(2 << 29 | 0x53 << 22)
+#define XY_MONO_SRC_COPY_IMM_BLT	(2 << 29 | 0x71 << 22 | 5)
 #define BLT_WRITE_A			(2<<20)
 #define BLT_WRITE_RGB			(1<<20)
 #define BLT_WRITE_RGBA			(BLT_WRITE_RGB | BLT_WRITE_A)
diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
index dfacb265a995..be170b10d92f 100644
--- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
@@ -1136,7 +1136,7 @@ i830_emit_bb_start(struct i915_request *rq,
 	 * stable batch scratch bo area (so that the CS never
 	 * stumbles over its tlb invalidation bug) ...
 	 */
-	*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
+	*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
 	*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
 	*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
 	*cs++ = cs_offset;
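The selftest above is the only caller this patch adds. As a rough illustration of the calling convention, here is a minimal sketch of a hypothetical driver path that copies one object into another. The function name example_copy_object() and the final wait are assumptions made for illustration; the struct_mutex locking and the use of the blitter engine's kernel context mirror igt_copy_blt() above.

/*
 * Illustrative sketch only (not part of this patch): drives the new
 * i915_gem_object_copy_blt() helper the same way igt_copy_blt() does.
 * example_copy_object() is a hypothetical caller; the final wait is an
 * assumption about how a caller would observe completion.
 */
static int example_copy_object(struct drm_i915_private *i915,
			       struct drm_i915_gem_object *src,
			       struct drm_i915_gem_object *dst)
{
	/* Kernel context on the copy (blitter) engine, as in the selftest. */
	struct intel_context *ce = i915->engine[BCS0]->kernel_context;
	int err;

	/* intel_emit_vma_copy_blt() asserts that both objects are the same size. */
	if (src->base.size != dst->base.size)
		return -EINVAL;

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_object_copy_blt(src, dst, ce);
	mutex_unlock(&i915->drm.struct_mutex);
	if (err)
		return err;

	/* The copy is queued on a request; wait for the write to land. */
	return i915_gem_object_wait(dst, I915_WAIT_ALL, MAX_SCHEDULE_TIMEOUT);
}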